rm(list=ls())
library(ezids)
library(tidyverse)
library(janitor)
library(scales)
library(ggrepel)
library(corrplot)
library(tigris)
library(sf)
library(ggridges)
library(scales)
library(tidycensus)
library(here)
library(car)
library(caTools)
library(glmnet)
library(randomForest)
library(rpart)
library(rpart.plot)
library(rattle)
library(kableExtra)
library(ModelMetrics)
This project uses the cbsa-level 2024 Realtor.com dataset referenced as the source data from our Project 1 data source: the American Enterprise Institute. We enrich this dataset with 1-year 2024 data from the American Community Survey. The variables are:
| Original Variable Name | New Variable Name | Definition |
|---|---|---|
| month_date_yyyymm | month_date_yyyymm | Month/Year date in Realtor.com dataset |
| cbsa_code | cbsa_code | Core-Based Statistical Area (CBSA) code |
| cbsa_title | cbsa_title | CBSA name/title |
| median_listing_price_per_square_foot | median_listing_price_per_square_foot | Median listing price per square foot |
| total_listing_count | total_listing_count | Total number of listings in the CBSA/month |
| pending_ratio | pending_ratio | Share of listings pending sale |
| median_days_on_market | median_days_on_market | Median number of days listings are on the market |
| log_median_price_sqft | log_median_price_sqft | Log of median listing price per square foot |
| DP05_0001E | total_population | Total population (ACS) |
| DP05_0003PE | pct_female | Percent female |
| DP05_0037PE | pct_white | Percent White alone |
| DP05_0045PE | pct_black | Percent Black alone |
| DP05_0053PE | pct_aian | Percent American Indian/Alaska Native |
| DP05_0061PE | pct_asian | Percent Asian alone |
| DP05_0069PE | pct_nhpi | Percent Native Hawaiian/Pacific Islander |
| DP05_0075PE | pct_two_or_more | Percent two or more races |
| DP05_0090PE | pct_hispanic | Percent Hispanic or Latino |
| DP05_0018E | median_age | Median age |
| DP05_0019PE | pct_under_18 | Percent under 18 years old |
| DP02_0001E | total_households | Total households |
| DP02_0002PE | pct_married_couple | Percent married-couple households |
| DP02_0006PE | pct_male_householder | Percent male householder, no spouse |
| DP02_0010PE | pct_female_householder | Percent female householder, no spouse |
| DP03_0005PE | pct_unemployed | Percent unemployed |
| DP03_0119PE | pct_below_poverty | Percent below poverty level |
| DP03_0053PE | pct_income_10_14999 | Percent income $10k–$14,999 |
| DP03_0054PE | pct_income_15_24999 | Percent income $15k–$24,999 |
| DP03_0055PE | pct_income_25_34999 | Percent income $25k–$34,999 |
| DP03_0056PE | pct_income_35_49999 | Percent income $35k–$49,999 |
| DP03_0057PE | pct_income_50_74999 | Percent income $50k–$74,999 |
| DP03_0058PE | pct_income_75_99999 | Percent income $75k–$99,999 |
| DP03_0059PE | pct_income_100_149k | Percent income $100k–$149k |
| DP03_0060PE | pct_income_150_199k | Percent income $150k–$199k |
| DP03_0061PE | pct_income_200k_plus | Percent income $200k+ |
| DP03_0062E | median_household_income | Median household income |
| DP02_0067PE | pct_high_school_plus | Percent high school or higher |
| DP02_0068PE | pct_bachelors_plus | Percent bachelor’s degree or higher |
| DP04_0001E | total_housing_units | Total housing units |
| DP04_0003PE | pct_vacant_units | Percent vacant housing units |
| DP04_0046PE | pct_owner_occupied | Percent owner-occupied housing units |
| DP04_0017PE | pct_built_2020_plus | Percent built 2020+ |
| DP04_0018PE | pct_built_2010_2019 | Percent built 2010–2019 |
| DP04_0019PE | pct_built_2000_2009 | Percent built 2000–2009 |
| DP04_0020PE | pct_built_1990_1999 | Percent built 1990–1999 |
| DP04_0021PE | pct_built_1980_1989 | Percent built 1980–1989 |
| DP04_0022PE | pct_built_1970_1979 | Percent built 1970–1979 |
| DP04_0023PE | pct_built_1960_1969 | Percent built 1960–1969 |
| DP04_0024PE | pct_built_1950_1959 | Percent built 1950–1959 |
| DP04_0025PE | pct_built_1940_1949 | Percent built 1940–1949 |
| DP04_0040PE | pct_1_bed | Percent 1-bedroom units |
| DP04_0041PE | pct_2_bed | Percent 2-bedroom units |
| DP04_0042PE | pct_3_bed | Percent 3-bedroom units |
| DP04_0043PE | pct_4_bed | Percent 4-bedroom units |
| DP04_0044PE | pct_5plus_bed | Percent 5+ bedroom units |
#ACS variable codes
#use to make a dictionary for the report
#demographics
#DP05_0001E - Total Population
#DP05_0003PE – % Female
#DP05_0037PE – % White alone
#DP05_0045PE – % Black or African American alone
#DP05_0053PE - % American Indian and Alaska Native alone
#DP05_0061PE – % Asian alone
#DP05_0069PE - % Native Hawaiian and Other Pacific Islander alone
#DP05_0075PE - % Two or More Races
#DP05_0090PE – % Hispanic or Latino
#DP05_0018E - Median age (years)
#DP05_0019PE - % Under 18 years
#households
#DP02_0001E – Total households
#DP02_0002PE – % Married-couple households
#DP02_0006PE – % Male householder, no spouse
#DP02_0010PE – % Female householder, no spouse
#economics
#DP03_0005PE – % Unemployed
#DP03_0119PE – % Below poverty level
#DP03_0052PE - % Household income <$10,000 (ELIMINATE FOR MULTICOLLINEARITY)
#DP03_0053PE - % Household income $10,000 to $14,999
#DP03_0054PE - % Household income $15,000 to $24,999
#DP03_0055PE - % Household income $25,000 to $34,999
#DP03_0056PE - % Household income $35,000 to $49,999
#DP03_0057PE - % Household income $50,000 to $74,999
#DP03_0058PE – % Household income $75,000 to $99,999
#DP03_0059PE – % Household income $100,000 to $149,999
#DP03_0060PE – % Household income $150,000 to $199,999
#DP03_0061PE – % Household income $200,000 or more
#DP03_0062E - Median household income (dollars)
#DP02_0067PE - % High school graduate or higher
#DP02_0068PE – % Bachelor’s degree or higher
#housing
#DP04_0001E – Total housing units
#DP04_0003PE – % Vacant units
#DP04_0046PE – % of occupied that are Owner-occupied
#DP04_0017PE - % Units built 2020 or later
#DP04_0018PE - % Units built 2010 to 2019
#DP04_0019PE – % Units built 2000 to 2009
#DP04_0020PE – % Units built 1990 to 1999
#DP04_0021PE – % Units built 1980 to 1989
#DP04_0022PE – % Units built 1970 to 1979
#DP04_0023PE – % Units built 1960 to 1969
#DP04_0024PE – % Units built 1950 to 1959
#DP04_0025PE – % Units built 1940 to 1949
#DP04_0026PE – % Units built 1939 or earlier (ELIMINATE FOR MULTICOLLINEARITY)
#DP04_0039PE - % 0 Bedrooms (ELIMINATE FOR MULTICOLLINEARITY)
#DP04_0040PE - % 1 Bedrooms
#DP04_0041PE - % 2 Bedrooms
#DP04_0042PE - % 3 Bedrooms
#DP04_0043PE - % 4 Bedrooms
#DP04_0044PE - % 5 Bedrooms or more
census_api_key("29e7dfea2f8b253a0a10ccd9626f78e49f4f0a4f")
# census_vars = load_variables(year = 2024, dataset = "acs1/profile", cache = TRUE)
# census_vars_limited = grepv("^(?!DP02PR).*P$", census_vars$name, perl=TRUE)
# acs1_2024 = get_acs(geography = "metropolitan statistical area/micropolitan statistical area",
# variables = census_vars_limited,
# year = 2024,
# survey = "acs1",
# output = "wide")
# head(acs1_2024)
vars_needed <- c("NAME",
"GEOID",
"DP05_0001E",
"DP05_0003PE",
"DP05_0037PE",
"DP05_0045PE",
"DP05_0053PE",
"DP05_0061PE",
"DP05_0069PE",
"DP05_0075PE",
"DP05_0090PE",
"DP05_0018E",
"DP05_0019PE",
"DP02_0001E",
"DP02_0002PE",
"DP02_0006PE",
"DP02_0010PE",
"DP03_0005PE",
"DP03_0119PE",
# "DP03_0052PE",
"DP03_0053PE",
"DP03_0054PE",
"DP03_0055PE",
"DP03_0056PE",
"DP03_0057PE",
"DP03_0058PE",
"DP03_0059PE",
"DP03_0060PE",
"DP03_0061PE",
"DP03_0062E",
"DP02_0067PE",
"DP02_0068PE",
"DP04_0001E",
"DP04_0003PE",
"DP04_0046PE",
"DP04_0017PE",
"DP04_0018PE",
"DP04_0019PE",
"DP04_0020PE",
"DP04_0021PE",
"DP04_0022PE",
"DP04_0023PE",
"DP04_0024PE",
"DP04_0025PE",
# "DP04_0026PE",
# "DP04_0039PE",
"DP04_0040PE",
"DP04_0041PE",
"DP04_0042PE",
"DP04_0043PE",
"DP04_0044PE")
acs1_2024_orig = get_acs(geography = "metropolitan statistical area/micropolitan statistical area",
variables = vars_needed,
year = 2024,
survey = "acs1",
output = "wide")
head(acs1_2024_orig)
## # A tibble: 6 × 94
## GEOID NAME DP05_0001E DP05_0001M DP05_0003PE DP05_0003PM DP05_0037PE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10140 Aberdeen, WA … 77893 NA 49.5 0.8 75.5
## 2 10180 Abilene, TX M… 181969 3181 48.2 0.7 66.7
## 3 10300 Adrian, MI Mi… 97746 NA 50.2 0.6 85.6
## 4 10380 Aguadilla, PR… 250969 NA 51.4 1 52.1
## 5 10420 Akron, OH Met… 702209 NA 51.5 0.2 76
## 6 10460 Alamogordo, N… 69711 NA 47 1.6 53.9
## # ℹ 87 more variables: DP05_0037PM <dbl>, DP05_0045PE <dbl>, DP05_0045PM <dbl>,
## # DP05_0053PE <dbl>, DP05_0053PM <dbl>, DP05_0061PE <dbl>, DP05_0061PM <dbl>,
## # DP05_0069PE <dbl>, DP05_0069PM <dbl>, DP05_0075PE <dbl>, DP05_0075PM <dbl>,
## # DP05_0090PE <dbl>, DP05_0090PM <dbl>, DP05_0018E <dbl>, DP05_0018M <dbl>,
## # DP05_0019PE <dbl>, DP05_0019PM <dbl>, DP02_0001E <dbl>, DP02_0001M <dbl>,
## # DP02_0002PE <dbl>, DP02_0002PM <dbl>, DP02_0006PE <dbl>, DP02_0006PM <dbl>,
## # DP02_0010PE <dbl>, DP02_0010PM <dbl>, DP03_0005PE <dbl>, …
acs1_2024_no_M = select(acs1_2024_orig, -grepv("M$", names(acs1_2024_orig)))
head(acs1_2024_no_M)
## # A tibble: 6 × 48
## GEOID NAME DP05_0001E DP05_0003PE DP05_0037PE DP05_0045PE DP05_0053PE
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10140 Aberdeen, WA… 77893 49.5 75.5 1.3 4.2
## 2 10180 Abilene, TX … 181969 48.2 66.7 8.5 0.4
## 3 10300 Adrian, MI M… 97746 50.2 85.6 2.4 0.2
## 4 10380 Aguadilla, P… 250969 51.4 52.1 1.2 0
## 5 10420 Akron, OH Me… 702209 51.5 76 12.3 0.2
## 6 10460 Alamogordo, … 69711 47 53.9 4.4 6.5
## # ℹ 41 more variables: DP05_0061PE <dbl>, DP05_0069PE <dbl>, DP05_0075PE <dbl>,
## # DP05_0090PE <dbl>, DP05_0018E <dbl>, DP05_0019PE <dbl>, DP02_0001E <dbl>,
## # DP02_0002PE <dbl>, DP02_0006PE <dbl>, DP02_0010PE <dbl>, DP03_0005PE <dbl>,
## # DP03_0119PE <dbl>, DP03_0053PE <dbl>, DP03_0054PE <dbl>, DP03_0055PE <dbl>,
## # DP03_0056PE <dbl>, DP03_0057PE <dbl>, DP03_0058PE <dbl>, DP03_0059PE <dbl>,
## # DP03_0060PE <dbl>, DP03_0061PE <dbl>, DP03_0062E <dbl>, DP02_0067PE <dbl>,
## # DP02_0068PE <dbl>, DP04_0001E <dbl>, DP04_0003PE <dbl>, …
#This includes both percents as well as margins of error. I (Alex) have included code to drop the margins of error.
#rename ACS variable codes to real things
acs1_2024 <- acs1_2024_no_M %>%
rename(total_population = DP05_0001E,
pct_female = DP05_0003PE,
pct_white = DP05_0037PE,
pct_black = DP05_0045PE,
pct_aian = DP05_0053PE,
pct_asian = DP05_0061PE,
pct_nhpi = DP05_0069PE,
pct_two_or_more = DP05_0075PE,
pct_hispanic = DP05_0090PE,
median_age = DP05_0018E,
pct_under_18 = DP05_0019PE,
total_households = DP02_0001E,
pct_married_couple = DP02_0002PE,
pct_male_householder = DP02_0006PE,
pct_female_householder = DP02_0010PE,
pct_unemployed = DP03_0005PE,
pct_below_poverty = DP03_0119PE,
# pct_income_lt10k = DP03_0052PE,
pct_income_10_14999 = DP03_0053PE,
pct_income_15_24999 = DP03_0054PE,
pct_income_25_34999 = DP03_0055PE,
pct_income_35_49999 = DP03_0056PE,
pct_income_50_74999 = DP03_0057PE,
pct_income_75_99999 = DP03_0058PE,
pct_income_100_149k = DP03_0059PE,
pct_income_150_199k = DP03_0060PE,
pct_income_200k_plus = DP03_0061PE,
median_household_income = DP03_0062E,
pct_high_school_plus = DP02_0067PE,
pct_bachelors_plus = DP02_0068PE,
total_housing_units = DP04_0001E,
pct_vacant_units = DP04_0003PE,
pct_owner_occupied = DP04_0046PE,
pct_built_2020_plus = DP04_0017PE,
pct_built_2010_2019 = DP04_0018PE,
pct_built_2000_2009 = DP04_0019PE,
pct_built_1990_1999 = DP04_0020PE,
pct_built_1980_1989 = DP04_0021PE,
pct_built_1970_1979 = DP04_0022PE,
pct_built_1960_1969 = DP04_0023PE,
pct_built_1950_1959 = DP04_0024PE,
pct_built_1940_1949 = DP04_0025PE,
# pct_built_1939_earlier = DP04_0026PE,
# pct_0_bed = DP04_0039PE,
pct_1_bed = DP04_0040PE,
pct_2_bed = DP04_0041PE,
pct_3_bed = DP04_0042PE,
pct_4_bed = DP04_0043PE,
pct_5plus_bed = DP04_0044PE)
# #check it out
head(acs1_2024)
## # A tibble: 6 × 48
## GEOID NAME total_population pct_female pct_white pct_black pct_aian pct_asian
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 10140 Aber… 77893 49.5 75.5 1.3 4.2 1.6
## 2 10180 Abil… 181969 48.2 66.7 8.5 0.4 2
## 3 10300 Adri… 97746 50.2 85.6 2.4 0.2 0.3
## 4 10380 Agua… 250969 51.4 52.1 1.2 0 0
## 5 10420 Akro… 702209 51.5 76 12.3 0.2 4.3
## 6 10460 Alam… 69711 47 53.9 4.4 6.5 1.5
## # ℹ 40 more variables: pct_nhpi <dbl>, pct_two_or_more <dbl>,
## # pct_hispanic <dbl>, median_age <dbl>, pct_under_18 <dbl>,
## # total_households <dbl>, pct_married_couple <dbl>,
## # pct_male_householder <dbl>, pct_female_householder <dbl>,
## # pct_unemployed <dbl>, pct_below_poverty <dbl>, pct_income_10_14999 <dbl>,
## # pct_income_15_24999 <dbl>, pct_income_25_34999 <dbl>,
## # pct_income_35_49999 <dbl>, pct_income_50_74999 <dbl>, …
#I want to make a area type var here before merging
#create area_type var
acs1_2024 <- acs1_2024 %>%
mutate(area_type = ifelse(grepl("Micro Area",NAME), "Micro", "Metro"))
unique(acs1_2024$area_type)
## [1] "Micro" "Metro"
#ilgaz' correlation stuff
# acs_clean <- acs1_2024_pcts %>% drop_na()
# num_df <- acs_clean %>%
# select(where(is.numeric))
# num_df$dummy_target <- rnorm(nrow(num_df))
#model <- lm(dummy_target ~ ., data = num_df)
#check_collinearity(model)
# head(acs_clean)
# num_df <- acs1_2024_pcts %>% select(where(is.numeric))
#
# corr_mat <- cor(num_df, use = "pairwise.complete.obs")
# threshold <- 0.80
#
# high_corr <- which(abs(corr_mat) > threshold,
# arr.ind = TRUE)
#
# # convert to readable table
# high_corr_pairs <- data.frame(
# var1 = rownames(corr_mat)[high_corr[,1]],
# var2 = colnames(corr_mat)[high_corr[,2]],
# corr = corr_mat[high_corr]
# )
#
# # remove duplicates (since matrix is symmetric)
# high_corr_pairs <- high_corr_pairs[high_corr_pairs$var1 < high_corr_pairs$var2, ]
#
#
# high_corr_pairs # 2794 out of 147,696 pairs (around 2%) have an absolute corr >= .80
#
# #steph's sidequest
# sq_acs <- acs1_2024_pcts
#
# #numeric-only subset
# sq_acs_num <- sq_acs %>%
# select(where(is.numeric))
#
# #ID'ing fake percents and set to na
# sq_acs_num <- sq_acs_num %>%
# mutate(across(everything(),
# ~ ifelse(. > 100, NA, .)))
#
# #dropping all na columns
# sq_acs_num <- sq_acs_num %>%
# select(where(~ !all(is.na(.))))
#
# #converting NA values to median
# sq_acs_num <- sq_acs_num %>%
# mutate(across(everything(),
# ~ ifelse(is.na(.),
# median(., na.rm = TRUE),
# .)))
#
# #dropping constant or near-constant cols
# sq_acs_num <- sq_acs_num %>%
# select(where(function(col) {
# s <- sd(col, na.rm = TRUE)
# if (is.na(s)) s <- 0
# s > 10
# }))
#
# #dummy target
# set.seed(123)
# sq_acs_num$fake <- rnorm(nrow(sq_acs_num))
#
# #fitting dummy linear model
# sq_acs_mod <- lm(fake ~ ., data = sq_acs_num)
#
# #computing vif
# compute_vif <- function(df){
# vars <- colnames(df)
# out <- tibble(variable = vars, vif = NA_real_)
#
# for(i in seq_along(vars)){
# y <- df[[i]]
# X <- df[, -i, drop = FALSE]
#
# #fit model for this variable alone
# mod <- tryCatch(lm(y ~ ., data = as.data.frame(X)),
# error = function(e) NULL)
#
# if(!is.null(mod)){
# out$vif[i] <- tryCatch(
# max(car::vif(mod)),
# error = function(e) NA_real_
# )
# }
# }
#
# out
# }
#
# vif_results <- compute_vif(sq_acs_num)
#
# #sort highest VIF at top
# vif_results_sorted <- vif_results %>% arrange(desc(vif))
#
# vif_results_sorted
#team, should we just delete this chunk?- BW
#which variables to keep
# #filter down the dataset
# acs1_2024_filtered <- acs1_2024_pcts[, c("GEOID", "NAME", vars_needed)]
#
# #rename ACS variable codes to real things
# acs1_2024_filtered <- acs1_2024_filtered %>%
# rename(total_population = DP05_0001E,
# pct_female = DP05_0003PE,
# pct_white = DP05_0037PE,
# pct_black = DP05_0045PE,
# pct_aian = DP05_0053PE,
# pct_asian = DP05_0061PE,
# pct_nhpi = DP05_0069PE,
# pct_two_or_more = DP05_0075PE,
# pct_hispanic = DP05_0090PE,
# median_age = DP05_0018E,
# pct_under_18 = DP05_0019PE,
# total_households = DP02_0001E,
# pct_married_couple = DP02_0002PE,
# pct_male_householder = DP02_0006PE,
# pct_female_householder = DP02_0010PE,
# pct_unemployed = DP03_0005PE,
# pct_below_poverty = DP03_0119PE,
# pct_income_lt10k = DP03_0052PE,
# pct_income_10_14999 = DP03_0053PE,
# pct_income_15_24999 = DP03_0054PE,
# pct_income_25_34999 = DP03_0055PE,
# pct_income_35_49999 = DP03_0056PE,
# pct_income_50_74999 = DP03_0057PE,
# pct_income_75_99999 = DP03_0058PE,
# pct_income_100_149k = DP03_0059PE,
# pct_income_150_199k = DP03_0060PE,
# pct_income_200k_plus = DP03_0061PE,
# median_household_income = DP03_0062E,
# pct_high_school_plus = DP02_0067PE,
# pct_bachelors_plus = DP02_0068PE,
# total_housing_units = DP04_0001E,
# pct_vacant_units = DP04_0003PE,
# pct_owner_occupied = DP04_0046PE,
# pct_built_2020_plus = DP04_0017PE,
# pct_built_2010_2019 = DP04_0018PE,
# pct_built_2000_2009 = DP04_0019PE,
# pct_built_1990_1999 = DP04_0020PE,
# pct_built_1980_1989 = DP04_0021PE,
# pct_built_1970_1979 = DP04_0022PE,
# pct_built_1960_1969 = DP04_0023PE,
# pct_built_1950_1959 = DP04_0024PE,
# pct_built_1940_1949 = DP04_0025PE,
# pct_built_1939_earlier = DP04_0026PE,
# pct_0_bed = DP04_0039PE,
# pct_1_bed = DP04_0040PE,
# pct_2_bed = DP04_0041PE,
# pct_3_bed = DP04_0042PE,
# pct_4_bed = DP04_0043PE,
# pct_5plus_bed = DP04_0044PE)
#
# # #check it out
# head(acs1_2024_filtered)
#
# # remove a few collinear / unusable vars
# # acs1_2024_filtered <- acs1_2024_filtered %>%
# # select(-pct_male,
# # -pct_hispanic,
# # -pct_65_over,
# # -pct_renter_occupied)
#
# #I want to make a area type var here before merging
# #create area_type var
# acs1_2024_filtered <- acs1_2024_filtered %>%
# mutate(area_type = ifelse(grepl("Micro Area",NAME), "Micro", "Metro"))
#
# unique(acs1_2024_filtered$area_type)
#team, I think we should delete this chunk too. -BW
#loading in inventory data
inventory_raw <- read_csv(here("../RDC_Inventory_Core_Metrics_Metro_History.csv"))
#cleaning names
clean_names(inventory_raw)
## # A tibble: 103,600 × 47
## month_date_yyyymm cbsa_code cbsa_title household_rank median_listing_price
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 202510 35620 New York-New… 1 762450
## 2 202510 31080 Los Angeles-… 2 1099000
## 3 202510 16980 Chicago-Nape… 3 364900
## 4 202510 19100 Dallas-Fort … 4 425000
## 5 202510 26420 Houston-Pasa… 5 358000
## 6 202510 37980 Philadelphia… 6 379973
## 7 202510 33100 Miami-Fort L… 7 499999
## 8 202510 12060 Atlanta-Sand… 8 415000
## 9 202510 47900 Washington-A… 9 594500
## 10 202510 38060 Phoenix-Mesa… 10 495000
## # ℹ 103,590 more rows
## # ℹ 42 more variables: median_listing_price_mm <dbl>,
## # median_listing_price_yy <dbl>, active_listing_count <dbl>,
## # active_listing_count_mm <dbl>, active_listing_count_yy <dbl>,
## # median_days_on_market <dbl>, median_days_on_market_mm <dbl>,
## # median_days_on_market_yy <dbl>, new_listing_count <dbl>,
## # new_listing_count_mm <dbl>, new_listing_count_yy <dbl>, …
#hunting down year
glimpse(inventory_raw)
## Rows: 103,600
## Columns: 47
## $ month_date_yyyymm <dbl> 202510, 202510, 202510, 202510…
## $ cbsa_code <dbl> 35620, 31080, 16980, 19100, 26…
## $ cbsa_title <chr> "New York-Newark-Jersey City, …
## $ HouseholdRank <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,…
## $ median_listing_price <dbl> 762450, 1099000, 364900, 42500…
## $ median_listing_price_mm <dbl> 0.0039, 0.0000, -0.0200, 0.000…
## $ median_listing_price_yy <dbl> -0.0160, -0.0443, -0.0138, -0.…
## $ active_listing_count <dbl> 36156, 18409, 16751, 29448, 33…
## $ active_listing_count_mm <dbl> 0.0074, -0.0470, -0.0048, -0.0…
## $ active_listing_count_yy <dbl> 0.0388, 0.1787, -0.0175, 0.136…
## $ median_days_on_market <dbl> 54, 57, 38, 64, 57, 43, 86, 60…
## $ median_days_on_market_mm <dbl> -0.0609, 0.0000, 0.0133, 0.015…
## $ median_days_on_market_yy <dbl> 0.0093, 0.1875, 0.0270, 0.1327…
## $ new_listing_count <dbl> 13832, 7832, 8644, 8794, 9428,…
## $ new_listing_count_mm <dbl> -0.1703, -0.0046, -0.1044, -0.…
## $ new_listing_count_yy <dbl> 0.0716, 0.0158, -0.0476, -0.04…
## $ price_increased_count <dbl> 362, 330, 160, 448, 1304, 266,…
## $ price_increased_count_mm <dbl> 0.0838, 0.1074, -0.2308, -0.05…
## $ price_increased_count_yy <dbl> -0.1084, 0.0123, -0.3043, -0.2…
## $ price_increased_share <dbl> 0.0046, 0.0100, 0.0043, 0.0094…
## $ price_increased_share_mm <dbl> 0.0002, 0.0015, -0.0011, -0.00…
## $ price_increased_share_yy <dbl> -0.0016, -0.0006, -0.0019, -0.…
## $ price_reduced_count <dbl> 7062, 5260, 6308, 13318, 10622…
## $ price_reduced_count_mm <dbl> 0.0282, -0.0187, 0.0003, -0.00…
## $ price_reduced_count_yy <dbl> 0.2059, 0.2233, 0.0805, 0.1428…
## $ price_reduced_share <dbl> 0.0964, 0.1547, 0.1704, 0.2803…
## $ price_reduced_share_mm <dbl> 0.0054, 0.0023, 0.0067, 0.0068…
## $ price_reduced_share_yy <dbl> 0.0064, 0.0146, 0.0124, 0.0206…
## $ pending_listing_count <dbl> 23266, 7260, 11372, 9225, 9594…
## $ pending_listing_count_mm <dbl> -0.0015, -0.0285, -0.0487, -0.…
## $ pending_listing_count_yy <dbl> 0.3075, -0.0204, 0.0441, -0.05…
## $ median_listing_price_per_square_foot <dbl> 515, 662, 211, 202, 172, 230, …
## $ median_listing_price_per_square_foot_mm <dbl> 0.0174, -0.0046, -0.0089, -0.0…
## $ median_listing_price_per_square_foot_yy <dbl> -0.0346, -0.0232, 0.0000, -0.0…
## $ median_square_feet <dbl> 1498, 1747, 1727, 2166, 2097, …
## $ median_square_feet_mm <dbl> -0.0013, 0.0023, -0.0089, -0.0…
## $ median_square_feet_yy <dbl> -0.0360, -0.0199, -0.0265, 0.0…
## $ average_listing_price <dbl> 1580160, 2274862, 526075, 6210…
## $ average_listing_price_mm <dbl> 0.0214, 0.0052, -0.0146, 0.010…
## $ average_listing_price_yy <dbl> -0.0468, -0.0408, -0.0298, 0.0…
## $ total_listing_count <dbl> 58998, 25629, 28103, 38647, 43…
## $ total_listing_count_mm <dbl> 0.0050, -0.0432, -0.0209, -0.0…
## $ total_listing_count_yy <dbl> 0.1307, 0.1152, 0.0081, 0.0828…
## $ pending_ratio <dbl> 0.643, 0.394, 0.679, 0.313, 0.…
## $ pending_ratio_mm <dbl> -0.0057, 0.0075, -0.0314, -0.0…
## $ pending_ratio_yy <dbl> 0.1323, -0.0802, 0.0400, -0.06…
## $ quality_flag <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
#filtering for 2024
inventory_2024 <- inventory_raw %>%
filter(month_date_yyyymm >= 202401 & month_date_yyyymm <= 202412)
#filtered for target vars
inventory_2024_filtered <- inventory_2024%>%
select(
month_date_yyyymm,
cbsa_code,
cbsa_title,
median_listing_price_per_square_foot,
total_listing_count,
pending_ratio,
median_days_on_market)%>%
mutate(
log_median_price_sqft = log(median_listing_price_per_square_foot)) %>%
drop_na()
summary(inventory_2024_filtered)
## month_date_yyyymm cbsa_code cbsa_title
## Min. :202401 Min. :10100 Length:11052
## 1st Qu.:202404 1st Qu.:20100 Class :character
## Median :202407 Median :29860 Mode :character
## Mean :202406 Mean :29857
## 3rd Qu.:202410 3rd Qu.:39700
## Max. :202412 Max. :49820
## median_listing_price_per_square_foot total_listing_count pending_ratio
## Min. : 48 Min. : 17 Min. :0.002
## 1st Qu.: 134 1st Qu.: 136 1st Qu.:0.318
## Median : 169 Median : 284 Median :0.490
## Mean : 202 Mean : 1255 Mean :0.553
## 3rd Qu.: 224 3rd Qu.: 776 3rd Qu.:0.719
## Max. :1708 Max. :52936 Max. :2.679
## median_days_on_market log_median_price_sqft
## Min. : 9.0 Min. :3.87
## 1st Qu.: 46.0 1st Qu.:4.90
## Median : 58.0 Median :5.13
## Mean : 60.1 Mean :5.19
## 3rd Qu.: 72.0 3rd Qu.:5.41
## Max. :164.0 Max. :7.44
# inventory_2024_grouped <- inventory_2024_filtered %>%
# select(!month_date_yyyymm) %>%
# group_by(cbsa_code, cbsa_title)%>%
# summarise(
# med_median_listing_price_per_square_foot = median(median_listing_price_per_square_foot),
# # avg_log_median_listing_price = mean(log_median_listing_price, na.rm = TRUE),
# med_total_listing_count = median(total_listing_count),
# med_pending_ratio = median(pending_ratio),
# med_median_days_on_market = median(median_days_on_market)) %>%
# ungroup()
#
# print(inventory_2024_grouped)
The Realtor.com inventory dataset provides monthly metro-level housing indicators like listing counts, median prices, and days on market. Unlike the ACS, it doesn’t include paired estimates or margins of error. Each variable is already given as a direct point estimate. Some fields (the _mm and _yy columns) represent month-over-month or year-over-year percent changes, which introduce redundancy, so we later drop those before modeling. Aside from that, the data only need to be filtered to the 2024 months and are ready for use.
#cbsa code var to chr so merge works
inventory_2024_filtered <- inventory_2024_filtered %>%
mutate(cbsa_code = as.character(cbsa_code))
#merge ACS and inventory datasets
full_dataset <- acs1_2024 %>%
left_join(inventory_2024_filtered, by = c("GEOID" = "cbsa_code"))
#which vars to factor
factor_vars <- c("GEOID",
# "NAME",
"cbsa_title",
# "HouseholdRank",
# "quality_flag",
"area_type")
#drop redundant columns
redundant_vars <- c("NAME")
#clean dataset
full_dataset_clean <- full_dataset %>%
select(-all_of(redundant_vars)) %>% # Drop redundant columns
mutate(across(all_of(factor_vars), as.factor)) #%>% # Convert factor variables
# mutate(log_median_listing_price = log(median_listing_price)) # Log home price
#NAs by variable type
na_summary_one <- full_dataset_clean %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(everything(), names_to = "variable", values_to = "NA_count") %>%
mutate(var_type = ifelse(variable %in% factor_vars, "factor", "numeric")) %>%
arrange(desc(NA_count))
print(na_summary_one)
## # A tibble: 55 × 3
## variable NA_count var_type
## <chr> <int> <chr>
## 1 pct_white 60 numeric
## 2 pct_black 60 numeric
## 3 pct_aian 60 numeric
## 4 pct_asian 60 numeric
## 5 pct_nhpi 60 numeric
## 6 pct_two_or_more 60 numeric
## 7 total_households 6 numeric
## 8 pct_married_couple 6 numeric
## 9 pct_male_householder 6 numeric
## 10 pct_female_householder 6 numeric
## # ℹ 45 more rows
#address NAs (drop hispanic, when organized by FIPS code, nearest neighbor for ages, replace remaining NAs with 0)
#drop hispanic
#commenting out for now since some of this was done earlier and would like to convene before moving/removing
# full_dataset_clean <- full_dataset_clean %>%
# select(-pct_hispanic)
#
# #when organized by FIPS code, nearest neighbor for ages
# age_vars <- c("pct_under_18", "pct_65_over")
#
# full_dataset_clean <- full_dataset_clean %>%
# arrange(GEOID, month_date_yyyymm) %>%
# group_by(GEOID) %>%
# mutate(across(all_of(age_vars),
# ~ if(sum(!is.na(.)) >= 2) {
# approx(x = seq_along(.),
# y = .,
# xout = seq_along(.),
# method = "linear",
# rule = 2)$y
# } else {
# . # leave as-is (NA)
# })) %>%
# ungroup()
#
# na_counts_two <- full_dataset_clean %>%
# summarise(across(everything(), ~ sum(is.na(.)))) %>%
# pivot_longer(everything(), names_to = "variable", values_to = "na_count") %>%
# arrange(desc(na_count))
#
# na_counts_two
#remove remaining NAs
full_dataset_clean <- full_dataset_clean %>%
drop_na()
#got rid of 86 observations, but we still have over 6k
nrow(full_dataset_clean)
## [1] 6280
After merging the data to create our holistic dataset, we only dropped 86 rows to remove NA values. This left us with 6,280 observations remaining- more than enough to run a reliable analysis. To prepare for our analysis, we started by creating a summary table of key metrics around some of our key variables.
summary_long <- full_dataset_clean %>%
summarise(median_price = median(median_listing_price_per_square_foot, na.rm = TRUE),
mean_price = mean(median_listing_price_per_square_foot, na.rm = TRUE),
sd_price = sd(median_listing_price_per_square_foot, na.rm = TRUE),
median_supply = median(total_listing_count, na.rm = TRUE),
mean_supply = mean(total_listing_count, na.rm = TRUE),
sd_supply = sd(total_listing_count, na.rm = TRUE),
median_income = median(median_household_income, na.rm = TRUE),
mean_income = mean(median_household_income, na.rm = TRUE),
sd_income = sd(median_household_income, na.rm = TRUE),
median_pending_ratio = median(pending_ratio, na.rm = TRUE),
mean_pending_ratio = mean(pending_ratio, na.rm = TRUE),
sd_pending_ratio = sd(pending_ratio, na.rm = TRUE)) %>%
pivot_longer(everything(), names_to = "Metric", values_to = "Value")
summary_table_kable <- kable(summary_long, caption = "Summary Statistics") %>%
kable_styling(full_width = F, bootstrap_options = c("striped", "hover"))
summary_table_kable
| Metric | Value |
|---|---|
| median_price | 187.000 |
| mean_price | 219.229 |
| sd_price | 129.567 |
| median_supply | 626.000 |
| mean_supply | 2089.634 |
| sd_supply | 4826.752 |
| median_income | 71632.000 |
| mean_income | 73933.927 |
| sd_income | 15025.725 |
| median_pending_ratio | 0.538 |
| mean_pending_ratio | 0.599 |
| sd_pending_ratio | 0.336 |
Even before graphing and investigating our dataset further with exploratory data analysis (EDA), the summary statistics tell us that all of our key variables have a right-skew. For price, income, and pending ratio, this is a slight right-skew, likely as a result of a few, large outliers. However, for our supply variable, there is an extreme right-skew and a very large standard deviation. This is likely due to the structure of the variable instead of a fundamental error in the dataset, and the supply distribution plot in the EDA section will add further context to this variable summary.
As this is a new dataset to our team, we needed to start our project with some high-level exploratory data analysis. This data analysis helped us frame our approach to our statistical analysis and predictive models later in the project. For a few of our variables, we graphed both the raw variable distributions and also logged versions of these variables to ensure that we can capture larger relationships independent of influential outliers.
price_dist_plt= ggplot(inventory_2024_filtered, aes(x = median_listing_price_per_square_foot)) +
geom_histogram(binwidth = 50, fill = "steelblue", color = "black") +
labs(
title = "Distribution of Median Listing Price per Square Foot",
subtitle = "Median Price per Square Foot is heavily right-skewed",
x = "Median Price per Square Foot",
y = "Count",
caption = "SOURCE: Realtor.com") +
scale_x_continuous(labels= dollar_format(accuracy=1))+
theme_minimal(base_size = 11)+
theme(plot.title=element_text(face="bold",size=14,margin=ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10,color="gray25",margin=ggplot2::margin(b=8)),
plot.caption = element_text(size=8,color="gray40",hjust=0),
axis.title = element_text(size=10,face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
ggsave(filename = "price_distribution.png",
plot = price_dist_plt,
width = 8,
height = 5,
dpi = 300)
price_dist_plt
The first distribution we plotted was median price per square foot- our main dependent variable. The graph shows that a majority of median listing prices per square foot fall below $500 with a handful of very high values pulling the mean of the graph higher than the median.
logged_price_dist= ggplot(inventory_2024_filtered, aes(x = log_median_price_sqft)) +
geom_histogram(binwidth = 0.1, fill = "steelblue", color = "black") +
labs(title = "Distribution of Log Median Listing Price per Square Foot",
subtitle = "Log transformation reduces right skew and reveals the underlying distribution",
x = "Log Median Price per Square Foot",
y = "Count",
caption = "SOURCE: Realtor.com") +
scale_x_continuous(
labels = function(x) dollar(exp(x), accuracy = 1)) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face = "bold", size = 14, margin = ggplot2::margin(b = 3)),
plot.subtitle = element_text(size = 10, color = "gray25", margin = ggplot2::margin(b = 8)),
plot.caption = element_text(size = 8, color = "gray40", hjust = 0),
axis.title = element_text(size = 10, face = "bold"),
axis.text = element_text(size = 9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
ggsave(filename = "logged_price_distribution.png",
plot = logged_price_dist,
width = 8,
height = 5,
dpi = 300)
logged_price_dist
Plotting the logged median price per square foot shows a distribution much closer to the normal curve. Logging the price mitigated the heavy influence of of outlier values and creates a more robust source from which to draw relationship insights between variables.
housing_violion<- ggplot(inventory_2024_filtered, aes(x = "", y = total_listing_count)) +
geom_violin(fill = "lightblue", color = "steelblue", alpha = 0.7) +
geom_jitter(width = 0.1, alpha = 0.3, color = "darkblue") +
labs(title = "Distribution of Total Listing Count",
subtitle = "Before transformation, the distribution is heavily centered at the lower price range",
x = "",
y = "Total Listings",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
housing_violion
This graph attempts to draw further insights into the distribution of listing counts among our cbsa units, but the concentration falling so heavily near the bottom of the y-axis makes it difficult to specifically interpret the supply distribution. To address this, we also plotted the log of supply below.
log_housing_violin_plt= ggplot(inventory_2024_filtered, aes(x = "", y = total_listing_count)) +
geom_violin(fill = "lightblue", color = "steelblue", alpha = 0.7) +
geom_jitter(width = 0.1, alpha = 0.3, color = "darkblue") +
scale_y_log10() +
labs(title = "Distribution of Total Listing Count (Log Scale)",
subtitle = "The distribution density of total listing count is more easily visualized\nwith a log transformation and violin plot",
x = "",
y = "Log of Total Listings",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
ggsave(filename = "logged_supply_violin.png",
plot = log_housing_violin_plt,
width = 8,
height = 5,
dpi = 300)
log_housing_violin_plt
This graph shows the log distribution of listing supply overlayed on top of a violin plot of the same data. This added dimension and logged variable adds much more insight into the distribution of the supply variable. As the graph demonstrates, a majority of the supply variables land near the bottom of the y-axis, indicating that there are vastly more cbsa with low listing counts than high listing counts. This is likely due to the structure of our cbsa data. Micro areas can be cbsa regions with 10,000-50,000 residents, and macro areas are any over 50,000 residents. Most small towns fall in the micro level, whereas some massive metropolitan areas (like New York, Chicago, Los Angeles) have millions of residents. This leads to a vast majority of the units falling on the lower end of the spectrum with large, meaninful outliers near the top. For further graphs, we also plotted the log of supply to help normalize this distribution and draw out the true nature of the relationships between variables.
price_supply_scatter= ggplot(inventory_2024_filtered, aes(x = total_listing_count, y = median_listing_price_per_square_foot)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
labs(title = "Price vs Total Supply",
subtitle = "Median price per square foot tends to increase as total listings increase",
x = "Total Listing Count",
y = "Median Price per Square Foot",
caption = "SOURCE: Realtor.com") +
scale_y_continuous(labels = dollar_format(accuracy = 1)) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
ggsave(filename = "pricesupply_scatter.png",
plot = price_supply_scatter,
width = 8,
height = 5,
dpi = 300)
price_supply_scatter
This graph plots the relationship between the un-transformed price and supply variables as this is the key relationship driving our SMART question. This plot shows how price appears to be positively correlated with total listing count, but we procedded with log transformations of the variables to be sure.
log_price_supply_scatter <- ggplot(inventory_2024_filtered, aes(x = total_listing_count, y = log_median_price_sqft)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
labs(title = "Log Price vs Total Supply",
subtitle = "Log transformation reduces skew and reveals variation across listings",
x = "Total Listing Count",
y = "Median Price per Square Foot (log scale)",
caption = "SOURCE: Realtor.com") +
scale_y_continuous(labels = function(x) dollar(exp(x), accuracy = 1)) +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
log_price_supply_scatter
After plotting the lof of our price variable against our supply variable, the positive relationship between price and supply appears even stronger than in our previous graph. However, the distribution of supply is still heavily skewed, so we also logged that variable for the next graph.
#log by log scatter with a regression line on top and the actual values associated with the log transformation
log_price_log_supply_scatter<- ggplot(inventory_2024_filtered,
aes(x = log(total_listing_count),
y = log(median_listing_price_per_square_foot))) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
scale_x_continuous(name = "Total Listing Count (log scale)",
labels = function(x) scales::comma(round(exp(x)))) +
scale_y_continuous(
name = "Median Price per Square Foot (log scale)",
labels = function(y) scales::dollar(round(exp(y), 0))) +
labs(title = "Log Price vs Log Total Supply",
subtitle = "Higher listing counts are generally associated with higher listing prices",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
log_price_log_supply_scatter
In this final version of our price/ supply scatterplot, we compare the log of price with the log of supply, and we see a clear positive trend between price and supply. Log of median price per square foot steadily increases as log of total listing count increases, supporting the idea that changes in housing supply- at least partially- influence changes in housing price.
logprice_logsupply_scatter= inventory_2024_filtered %>%
mutate(listing_bin = ntile(total_listing_count, 5)) %>%
ggplot(aes(x = factor(listing_bin), y = log_median_price_sqft)) +
geom_boxplot(fill = "lightblue", color = "steelblue") +
scale_y_continuous(
name = "Median Price per Square Foot (log scale)",
labels = function(y) scales::dollar(round(exp(y), 0))) +
labs(
title = "Price Distribution by Supply Quantiles",
subtitle= "Higher median log price is associated with higher supply",
x = "Supply Quantile (Total Listings)",
y = "Median Price per Sq Ft (log)",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(
plot.title = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9))
ggsave(filename = "logpricelogsupply_scatter.png",
plot = logprice_logsupply_scatter,
width = 8,
height = 5,
dpi = 300)
logprice_logsupply_scatter
When separating unlogged supply into quantiles for a boxplot, the positive correlation between these two variables remains plainly visible.
Price_demand_scatter= ggplot(inventory_2024_filtered, aes(x = pending_ratio, y = median_listing_price_per_square_foot)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
scale_y_continuous(labels = dollar_format(accuracy = 1)) +
labs(title = "Price vs Market Tightness (Demand)",
subtitle = "Higher market demand is associated with lower prices",
x = "Pending Ratio (Pending / Active Listings)",
y = "Median Price per Square Foot",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
ggsave(filename = "pricedemand_scatter.png",
plot = Price_demand_scatter,
width = 8,
height = 5,
dpi = 300)
Price_demand_scatter
The next relationship we investigated was the relationship between housing price and market demand. As is clean in the graph above, plotting median listing price per square foot against pending ratio (our proxy for market demand), reveals a slight negative relationship between price and demand. This is in line with traditional models of supply and demand where where supply is held constant, and higher demand is associated with lower market prices.
logprice_demand_scatter= ggplot(inventory_2024_filtered, aes(x = pending_ratio, y = log_median_price_sqft)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
scale_y_continuous(
labels = function(x) dollar(exp(x), accuracy = 1)) +
labs(title = "Log Price vs Market Tightness (Demand)",
subtitle = "Higher market demand is associated with lower log prices",
x = "Pending Ratio (Pending / Active Listings)",
y = "Median Price per Square Foot (log scale)",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
ggsave(filename = "logpricedemand_scatter.png",
plot = logprice_demand_scatter,
width = 8,
height = 5,
dpi = 300)
logprice_demand_scatter
We also graphed the relationship between log price and market demand, and the negative correlation between these relationships persists.
price_market_hotness_plot <-ggplot(inventory_2024_filtered, aes(x = median_days_on_market, y = median_listing_price_per_square_foot)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
scale_y_continuous(labels = dollar_format(accuracy = 1)) +
labs(title = "Price vs Market Speed",
subtitle = "The relationship between price and market speed is minimal",
x = "Median Days on Market",
y = "Median Price per Square Foot",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
price_market_hotness_plot
The next set of variables that we investigated in our exploratory data analysis was price and market speed. The graph above shows a slight positive relationship between price and market speed (represented with the median days on market variable), although the magnitude of the relationship appears small. Additionally, we cannot claim causality in this relationship as the risk of reverse causality is very high between these variables. We would require a much more thorough study to investigate any causal relationship between these variables.
log_price_market_hotness_plot<- ggplot(inventory_2024_filtered, aes(x = median_days_on_market, y = log_median_price_sqft)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "darkred", linewidth = 1) +
scale_y_continuous(labels = function(x) dollar(exp(x), accuracy = 1)) +
labs(title = "Log Price vs Market Speed",
subtitle = "The relationship between log price and market speed is also minimal",
x = "Median Days on Market",
y = "Median Price per Square Foot (log scale)",
caption = "SOURCE: Realtor.com") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin = ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin = ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
plot.background = element_blank())
log_price_market_hotness_plot
When we graphed the log price against the market speed, the direction of
the correlation changed. Instead of having a slightly negative
relationship, there now appears to be a very very slight positive
relationship between price and market speed. This graph simply adds
complexity to the relationship between these two variables- if there is
a noteworthy relationship between them at all.
price_income_scatter= ggplot(full_dataset_clean, aes(x = median_household_income,
y = median_listing_price_per_square_foot)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = TRUE, color = "darkred", linewidth = 1) +
scale_x_continuous(labels = scales::dollar) +
scale_y_continuous(labels = scales::dollar) +
labs(title = "Median Price vs Median Household Income",
subtitle = "There appears to be a strong positive relationship between median\nlisting price and median household income",
x = "Median Household Income",
y = "Median Price per Sq Ft",
caption = "SOURCE: Realtor.com & ACS 2024") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9))
ggsave(filename = "priceincome_scatter.png",
plot = price_income_scatter,
width = 8,
height = 5,
dpi = 300)
price_income_scatter
The final set of variables we analyzed was median price per square foot and median household income. We expected a strong, positive relationship between these variables, and as is plainly visible in the graph above, that is exactly what we found. Again, this is in line with classic models of supply and demand. However, neither of these variables have normal distributions, so we also compared their log transformations.
logprice_logincome_scatter= ggplot(full_dataset_clean) +
geom_point(aes(x = log(median_household_income),
y = log(median_listing_price_per_square_foot)),
alpha = 0.6, color = "steelblue") +
geom_smooth(aes(x = log(median_household_income),
y = log(median_listing_price_per_square_foot)),
method = "lm", se = TRUE, color = "darkred", linewidth = 1) +
scale_x_continuous(name = "Median Household Income (log scale)",
labels = function(x) scales::dollar(exp(x))) +
scale_y_continuous(name = "Median Price per Sq Ft (log scale)",
labels = function(y) scales::dollar(exp(y))) +
labs(title = "Log Price vs Log Median Household Income",
subtitle = "Standardizing both variables only emphasizes the relationship",
caption = "SOURCE: Realtor.com & ACS 2024") +
theme_minimal(base_size = 11) +
theme(plot.title = element_text(face="bold", size=14, margin=ggplot2::margin(b=3)),
plot.subtitle = element_text(size=10, color="gray25", margin=ggplot2::margin(b=8)),
plot.caption = element_text(size=8, color="gray40", hjust=0),
axis.title = element_text(size=10, face="bold"),
axis.text = element_text(size=9))
ggsave(filename = "logpricelogincome_scatter.png",
plot = logprice_logincome_scatter,
width = 8,
height = 5,
dpi = 300)
logprice_logincome_scatter
When plotting log price and log income, the positive relationship
between the variables is of an even larger magnitude than the
un-transformed versions of the variables. This demonstrates a clear and
strong relationship and the potential significance of income as a
predictor in our models.
EDA cannot establish causality by itself, but it does provide a strong starting point for more complex statistical testing of the data.
set.seed(4)
targets = c("median_listing_price_per_square_foot", "log_median_price_sqft")
split = sample.split(full_dataset_clean$GEOID, SplitRatio = 0.8)
train = subset(full_dataset_clean %>% select(-"GEOID"), split == TRUE)
test = subset(full_dataset_clean %>% select(-"GEOID"), split == FALSE)
# train
price_train = train$median_listing_price_per_square_foot
log_price_train = train$log_median_price_sqft
X_train = train %>% select(-all_of(targets))
X_train_matrix = model.matrix(~.-1,X_train)
X_train_no_cbsa = X_train %>% select(-cbsa_title)
X_train_no_cbsa_matrix = model.matrix(~.-1,X_train_no_cbsa)
# test
price_test = test$median_listing_price_per_square_foot
log_price_test = test$log_median_price_sqft
X_test = test %>% select(-all_of(targets))
X_test_matrix = model.matrix(~.-1,X_test)
X_test_no_cbsa = X_test %>% select(-cbsa_title)
X_test_no_cbsa_matrix = model.matrix(~.-1,X_test_no_cbsa)
# names(full_dataset_clean)
#reg d- "avg_log_median_listing_price" i- "total_active_listing_count"
reg_model_by_1 <- lm(median_listing_price_per_square_foot ~ total_listing_count,
data = train)
summary(reg_model_by_1)
##
## Call:
## lm(formula = median_listing_price_per_square_foot ~ total_listing_count,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -204.8 -68.2 -30.2 23.0 1065.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 209.640034 1.921284 109.1 <2e-16 ***
## total_listing_count 0.004556 0.000366 12.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 128 on 5231 degrees of freedom
## Multiple R-squared: 0.0288, Adjusted R-squared: 0.0286
## F-statistic: 155 on 1 and 5231 DF, p-value: <2e-16
#reg d- "avg_log_median_listing_price" i- "total_active_listing_count" "area_type" "avg_pct_income_75k_plus" "avg_pct_units_built_2000_plus" "avg_pct_bachelors_or_higher"
reg_model_by_all <- lm(median_listing_price_per_square_foot ~ .-cbsa_title-log_median_price_sqft,
data = train)
summary(reg_model_by_all)
##
## Call:
## lm(formula = median_listing_price_per_square_foot ~ . - cbsa_title -
## log_median_price_sqft, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -219.7 -32.1 -2.1 26.2 556.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8845.9248917 50189.5044941 -0.18 0.86010
## total_population 0.0000709 0.0000119 5.98 2.4e-09 ***
## pct_female 2.4375101 0.9224434 2.64 0.00826 **
## pct_white -3.3909536 0.4911432 -6.90 5.7e-12 ***
## pct_black -3.8852373 0.5029785 -7.72 1.3e-14 ***
## pct_aian -3.5247886 0.5352760 -6.58 5.0e-11 ***
## pct_asian -2.8157779 0.6672297 -4.22 2.5e-05 ***
## pct_nhpi 22.4770681 1.6719943 13.44 < 2e-16 ***
## pct_two_or_more -4.0802497 0.3878300 -10.52 < 2e-16 ***
## pct_hispanic 0.4671647 0.2961200 1.58 0.11471
## median_age 2.4508771 0.4051798 6.05 1.6e-09 ***
## pct_under_18 -4.1705049 0.6532789 -6.38 1.9e-10 ***
## total_households -0.0009366 0.0001044 -8.97 < 2e-16 ***
## pct_married_couple -4.4202959 0.6194868 -7.14 1.1e-12 ***
## pct_male_householder -8.5572505 0.7520751 -11.38 < 2e-16 ***
## pct_female_householder -3.7096113 0.7323602 -5.07 4.2e-07 ***
## pct_unemployed -7.5248864 1.2340013 -6.10 1.2e-09 ***
## pct_below_poverty -0.8776696 0.5437292 -1.61 0.10655
## pct_income_10_14999 1.2672191 1.1189377 1.13 0.25747
## pct_income_15_24999 6.6273718 1.0104158 6.56 5.9e-11 ***
## pct_income_25_34999 3.5914555 1.0064759 3.57 0.00036 ***
## pct_income_35_49999 4.5561182 0.8579653 5.31 1.1e-07 ***
## pct_income_50_74999 3.6882698 0.8404993 4.39 1.2e-05 ***
## pct_income_75_99999 0.0894807 1.0469556 0.09 0.93189
## pct_income_100_149k 4.7500064 1.0177460 4.67 3.1e-06 ***
## pct_income_150_199k 1.9035372 1.1177579 1.70 0.08863 .
## pct_income_200k_plus 8.3107327 1.2237206 6.79 1.2e-11 ***
## median_household_income 0.0042429 0.0003796 11.18 < 2e-16 ***
## pct_high_school_plus -0.0673494 0.4530865 -0.15 0.88184
## pct_bachelors_plus -0.6412146 0.2524776 -2.54 0.01112 *
## total_housing_units 0.0007263 0.0000966 7.52 6.4e-14 ***
## pct_vacant_units 3.8821105 0.2255146 17.21 < 2e-16 ***
## pct_owner_occupied -3.9518493 0.2559362 -15.44 < 2e-16 ***
## pct_built_2020_plus 1.7264530 0.6248400 2.76 0.00575 **
## pct_built_2010_2019 0.6467341 0.3729643 1.73 0.08297 .
## pct_built_2000_2009 1.7104181 0.3833736 4.46 8.3e-06 ***
## pct_built_1990_1999 5.4986155 0.3622425 15.18 < 2e-16 ***
## pct_built_1980_1989 0.1075156 0.3409086 0.32 0.75249
## pct_built_1970_1979 4.4634395 0.3294739 13.55 < 2e-16 ***
## pct_built_1960_1969 3.2606558 0.4481913 7.28 4.0e-13 ***
## pct_built_1950_1959 1.4099698 0.4856007 2.90 0.00371 **
## pct_built_1940_1949 1.2691494 0.7324796 1.73 0.08321 .
## pct_1_bed -2.9346380 0.9385550 -3.13 0.00178 **
## pct_2_bed -6.0047784 0.7778107 -7.72 1.4e-14 ***
## pct_3_bed -5.7643616 0.7913148 -7.28 3.7e-13 ***
## pct_4_bed -10.1503354 0.8250897 -12.30 < 2e-16 ***
## pct_5plus_bed -4.8878482 0.8743573 -5.59 2.4e-08 ***
## area_typeMicro 20.8463748 2.4677002 8.45 < 2e-16 ***
## month_date_yyyymm 0.0479433 0.2479423 0.19 0.84668
## total_listing_count -0.0058566 0.0005451 -10.74 < 2e-16 ***
## pending_ratio -12.2510452 3.1841257 -3.85 0.00012 ***
## median_days_on_market 0.3178351 0.0642984 4.94 7.9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.7 on 5181 degrees of freedom
## Multiple R-squared: 0.789, Adjusted R-squared: 0.787
## F-statistic: 380 on 51 and 5181 DF, p-value: <2e-16
Have to eliminate cbsa_title or we would overfit
# names(full_dataset_clean)
#reg d- "avg_log_median_listing_price" i- "total_active_listing_count"
reg_model_log_by_1 <- lm(log_median_price_sqft ~ total_listing_count,
data = train)
summary(reg_model_log_by_1)
##
## Call:
## lm(formula = log_median_price_sqft ~ total_listing_count, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.169 -0.274 -0.046 0.202 1.893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.24253933 0.00624529 839.4 <2e-16 ***
## total_listing_count 0.00001945 0.00000119 16.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.414 on 5231 degrees of freedom
## Multiple R-squared: 0.0486, Adjusted R-squared: 0.0485
## F-statistic: 267 on 1 and 5231 DF, p-value: <2e-16
Nearly double the multiple R-squared.
#reg d- "avg_log_median_listing_price" i- "total_active_listing_count" "area_type" "avg_pct_income_75k_plus" "avg_pct_units_built_2000_plus" "avg_pct_bachelors_or_higher"
reg_model_log_by_all <- lm(log_median_price_sqft ~ . -cbsa_title -median_listing_price_per_square_foot,
data = train)
summary(reg_model_log_by_all)
##
## Call:
## lm(formula = log_median_price_sqft ~ . - cbsa_title - median_listing_price_per_square_foot,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6310 -0.1124 -0.0033 0.1109 0.8750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -263.9506517811 147.1815778591 -1.79 0.07297 .
## total_population 0.0000000435 0.0000000348 1.25 0.21117
## pct_female 0.0216150690 0.0027050809 7.99 1.6e-15 ***
## pct_white -0.0183854352 0.0014402858 -12.77 < 2e-16 ***
## pct_black -0.0220597433 0.0014749931 -14.96 < 2e-16 ***
## pct_aian -0.0154160110 0.0015697061 -9.82 < 2e-16 ***
## pct_asian -0.0197734811 0.0019566624 -10.11 < 2e-16 ***
## pct_nhpi 0.0209086486 0.0049031519 4.26 2.0e-05 ***
## pct_two_or_more -0.0152579070 0.0011373182 -13.42 < 2e-16 ***
## pct_hispanic -0.0017765348 0.0008683769 -2.05 0.04083 *
## median_age 0.0083844151 0.0011881966 7.06 1.9e-12 ***
## pct_under_18 -0.0222256671 0.0019157515 -11.60 < 2e-16 ***
## total_households -0.0000026152 0.0000003060 -8.55 < 2e-16 ***
## pct_married_couple -0.0138774853 0.0018166557 -7.64 2.6e-14 ***
## pct_male_householder -0.0281977982 0.0022054730 -12.79 < 2e-16 ***
## pct_female_householder -0.0152457556 0.0021476587 -7.10 1.4e-12 ***
## pct_unemployed -0.0044214173 0.0036187298 -1.22 0.22183
## pct_below_poverty -0.0064797076 0.0015944951 -4.06 4.9e-05 ***
## pct_income_10_14999 0.0156180661 0.0032813040 4.76 2.0e-06 ***
## pct_income_15_24999 0.0178654188 0.0029630616 6.03 1.8e-09 ***
## pct_income_25_34999 0.0169949240 0.0029515076 5.76 9.0e-09 ***
## pct_income_35_49999 0.0202519328 0.0025159978 8.05 1.0e-15 ***
## pct_income_50_74999 0.0187283084 0.0024647786 7.60 3.5e-14 ***
## pct_income_75_99999 0.0203351809 0.0030702150 6.62 3.9e-11 ***
## pct_income_100_149k 0.0353523476 0.0029845575 11.85 < 2e-16 ***
## pct_income_150_199k 0.0409031617 0.0032778441 12.48 < 2e-16 ***
## pct_income_200k_plus 0.0521975589 0.0035885816 14.55 < 2e-16 ***
## median_household_income 0.0000004796 0.0000011133 0.43 0.66666
## pct_high_school_plus 0.0002452845 0.0013286839 0.18 0.85354
## pct_bachelors_plus -0.0012255201 0.0007403949 -1.66 0.09794 .
## total_housing_units 0.0000024487 0.0000002832 8.65 < 2e-16 ***
## pct_vacant_units 0.0084538688 0.0006613254 12.78 < 2e-16 ***
## pct_owner_occupied -0.0133510620 0.0007505373 -17.79 < 2e-16 ***
## pct_built_2020_plus 0.0182040448 0.0018323538 9.93 < 2e-16 ***
## pct_built_2010_2019 0.0025345517 0.0010937243 2.32 0.02052 *
## pct_built_2000_2009 0.0079883490 0.0011242495 7.11 1.4e-12 ***
## pct_built_1990_1999 0.0201886413 0.0010622824 19.00 < 2e-16 ***
## pct_built_1980_1989 0.0061704866 0.0009997202 6.17 7.2e-10 ***
## pct_built_1970_1979 0.0167673307 0.0009661879 17.35 < 2e-16 ***
## pct_built_1960_1969 0.0052036062 0.0013143285 3.96 7.6e-05 ***
## pct_built_1950_1959 0.0049729331 0.0014240322 3.49 0.00048 ***
## pct_built_1940_1949 0.0079898015 0.0021480089 3.72 0.00020 ***
## pct_1_bed -0.0110120715 0.0027523286 -4.00 6.4e-05 ***
## pct_2_bed -0.0158731141 0.0022809431 -6.96 3.9e-12 ***
## pct_3_bed -0.0182188726 0.0023205442 -7.85 5.0e-15 ***
## pct_4_bed -0.0278601810 0.0024195897 -11.51 < 2e-16 ***
## pct_5plus_bed -0.0139748305 0.0025640677 -5.45 5.3e-08 ***
## area_typeMicro 0.0745508272 0.0072365730 10.30 < 2e-16 ***
## month_date_yyyymm 0.0013381116 0.0007270949 1.84 0.06577 .
## total_listing_count -0.0000204797 0.0000015986 -12.81 < 2e-16 ***
## pending_ratio -0.0504287754 0.0093375029 -5.40 6.9e-08 ***
## median_days_on_market 0.0001558163 0.0001885562 0.83 0.40864
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.175 on 5181 degrees of freedom
## Multiple R-squared: 0.832, Adjusted R-squared: 0.83
## F-statistic: 502 on 51 and 5181 DF, p-value: <2e-16
Have to eliminate cbsa_title or we would overfit
Slightly better fit (also around 2% better, coincidentally)
cv_lasso_price = cv.glmnet(X_train_matrix, price_train, alpha = 1)
lasso_price_best_lambda = cv_lasso_price$lambda.min
lasso_price_best_lambda
## [1] 0.0221
plot(cv_lasso_price)
lasso_price_best_model = glmnet(X_train_matrix, price_train, alpha = 1, lambda=lasso_price_best_lambda)
coef(lasso_price_best_model)
## 581 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.83e+04
## total_population 6.15e-06
## pct_female 1.25e+00
## pct_white -1.51e+00
## pct_black -2.02e+00
## pct_aian .
## pct_asian 7.66e+00
## pct_nhpi 1.05e+01
## pct_two_or_more .
## pct_hispanic 8.87e-02
## median_age 3.75e+00
## pct_under_18 -1.34e+00
## total_households 2.44e-07
## pct_married_couple 4.18e-01
## pct_male_householder -2.97e+00
## pct_female_householder -2.60e-02
## pct_unemployed 1.11e+00
## pct_below_poverty -3.13e+00
## pct_income_10_14999 -1.68e+00
## pct_income_15_24999 -5.95e+00
## pct_income_25_34999 -2.96e+00
## pct_income_35_49999 -2.11e+00
## pct_income_50_74999 -3.41e+00
## pct_income_75_99999 -2.95e+00
## pct_income_100_149k 1.15e+00
## pct_income_150_199k 7.11e-02
## pct_income_200k_plus 1.28e+00
## median_household_income .
## pct_high_school_plus .
## pct_bachelors_plus .
## total_housing_units -1.69e-05
## pct_vacant_units 2.63e+00
## pct_owner_occupied -1.76e+00
## pct_built_2020_plus 1.58e+00
## pct_built_2010_2019 1.00e-02
## pct_built_2000_2009 7.01e-01
## pct_built_1990_1999 1.59e+00
## pct_built_1980_1989 -7.73e-03
## pct_built_1970_1979 2.33e+00
## pct_built_1960_1969 .
## pct_built_1950_1959 .
## pct_built_1940_1949 -1.66e+00
## pct_1_bed 2.36e-01
## pct_2_bed 1.91e+00
## pct_3_bed -4.16e-02
## pct_4_bed -3.92e+00
## pct_5plus_bed 1.16e+00
## area_typeMetro -6.55e+00
## area_typeMicro 8.18e-14
## month_date_yyyymm 1.42e-01
## cbsa_titleAbilene, TX -6.82e-01
## cbsa_titleAdrian, MI 3.75e+00
## cbsa_titleAkron, OH -2.99e+01
## cbsa_titleAlamogordo, NM -6.33e+00
## cbsa_titleAlbany-Schenectady-Troy, NY -9.66e+00
## cbsa_titleAlbany, GA -3.53e+01
## cbsa_titleAlbany, OR 8.39e+01
## cbsa_titleAlbemarle, NC 3.32e+01
## cbsa_titleAlbertville, AL 1.34e+01
## cbsa_titleAlbuquerque, NM -2.00e+01
## cbsa_titleAlexandria, LA -3.12e+00
## cbsa_titleAllentown-Bethlehem-Easton, PA-NJ -1.11e+01
## cbsa_titleAltoona, PA -5.98e+01
## cbsa_titleAmarillo, TX -3.50e+01
## cbsa_titleAmes, IA 3.91e+01
## cbsa_titleAmherst Town-Northampton, MA 3.62e+01
## cbsa_titleAnchorage, AK -1.13e+02
## cbsa_titleAnderson Creek, NC 3.53e+01
## cbsa_titleAnn Arbor, MI -4.31e+01
## cbsa_titleAnniston-Oxford, AL -2.82e+01
## cbsa_titleAppleton, WI 1.71e+01
## cbsa_titleAsheville, NC 4.33e+01
## cbsa_titleAthens-Clarke County, GA 5.27e+00
## cbsa_titleAthens, TN .
## cbsa_titleAthens, TX -6.67e+00
## cbsa_titleAtlanta-Sandy Springs-Roswell, GA -5.70e+01
## cbsa_titleAtlantic City-Hammonton, NJ 6.75e+01
## cbsa_titleAuburn-Opelika, AL 1.15e+00
## cbsa_titleAuburn, NY -2.26e+01
## cbsa_titleAugusta-Richmond County, GA-SC -1.79e-01
## cbsa_titleAugusta-Waterville, ME 3.04e+00
## cbsa_titleAustin-Round Rock-San Marcos, TX -5.68e+01
## cbsa_titleBakersfield-Delano, CA 5.88e-01
## cbsa_titleBaltimore-Columbia-Towson, MD -3.69e+01
## cbsa_titleBangor, ME -1.52e+01
## cbsa_titleBaraboo, WI -1.88e+01
## cbsa_titleBarnstable Town, MA 1.98e+02
## cbsa_titleBaton Rouge, LA 2.25e+01
## cbsa_titleBattle Creek, MI 9.40e+00
## cbsa_titleBay City, MI 9.22e+00
## cbsa_titleBeaumont-Port Arthur, TX -4.48e+01
## cbsa_titleBeaver Dam, WI -6.11e+00
## cbsa_titleBeckley, WV 1.14e+01
## cbsa_titleBellingham, WA 7.94e+01
## cbsa_titleBend, OR 4.95e+01
## cbsa_titleBillings, MT 2.04e+01
## cbsa_titleBinghamton, NY -1.80e+01
## cbsa_titleBirmingham, AL -7.86e+00
## cbsa_titleBismarck, ND 1.47e+00
## cbsa_titleBlacksburg-Christiansburg-Radford, VA -2.77e+01
## cbsa_titleBloomington, IL -6.70e+01
## cbsa_titleBloomington, IN -1.86e+01
## cbsa_titleBloomsburg-Berwick, PA 2.08e+00
## cbsa_titleBluefield, WV-VA -3.46e+01
## cbsa_titleBoise City, ID 6.79e+01
## cbsa_titleBoston-Cambridge-Newton, MA-NH 1.16e+02
## cbsa_titleBoulder, CO 1.32e+02
## cbsa_titleBowling Green, KY -1.97e+01
## cbsa_titleBozeman, MT 2.15e+02
## cbsa_titleBrainerd, MN .
## cbsa_titleBremerton-Silverdale-Port Orchard, WA -2.86e+01
## cbsa_titleBridgeport-Stamford-Danbury, CT 7.70e+01
## cbsa_titleBrigham City, UT-ID 5.69e+01
## cbsa_titleBrownsville-Harlingen, TX -1.59e+01
## cbsa_titleBrunswick-St. Simons, GA 3.51e+01
## cbsa_titleBuffalo-Cheektowaga, NY 1.42e+00
## cbsa_titleBurlington-South Burlington, VT 4.42e+01
## cbsa_titleBurlington, NC -1.66e+01
## cbsa_titleCanton-Massillon, OH -1.88e+01
## cbsa_titleCape Coral-Fort Myers, FL -9.79e+01
## cbsa_titleCape Girardeau, MO-IL -4.04e+01
## cbsa_titleCarson City, NV -2.53e+01
## cbsa_titleCasper, WY 4.49e+01
## cbsa_titleCedar City, UT 4.31e+01
## cbsa_titleCedar Rapids, IA -7.62e+00
## cbsa_titleCentralia, WA 2.95e+01
## cbsa_titleChambersburg, PA .
## cbsa_titleChampaign-Urbana, IL -8.71e+01
## cbsa_titleCharleston-North Charleston, SC 5.82e+01
## cbsa_titleCharleston, WV -5.64e+01
## cbsa_titleCharlotte-Concord-Gastonia, NC-SC -2.29e+01
## cbsa_titleCharlottesville, VA -1.66e+00
## cbsa_titleChattanooga, TN-GA 2.12e+01
## cbsa_titleCheyenne, WY 5.20e+00
## cbsa_titleChicago-Naperville-Elgin, IL-IN -7.59e+01
## cbsa_titleChico, CA 4.49e+00
## cbsa_titleChillicothe, OH .
## cbsa_titleCincinnati, OH-KY-IN 4.40e+00
## cbsa_titleClarksburg, WV -1.58e+01
## cbsa_titleClarksville, TN-KY 9.85e+00
## cbsa_titleClearlake, CA 2.91e+01
## cbsa_titleCleveland, OH -1.17e+01
## cbsa_titleCleveland, TN -2.99e+01
## cbsa_titleClovis, NM -7.66e+00
## cbsa_titleCoeur d'Alene, ID 1.09e+02
## cbsa_titleCollege Station-Bryan, TX 6.63e+00
## cbsa_titleColorado Springs, CO 1.21e+00
## cbsa_titleColumbia, MO -1.75e+01
## cbsa_titleColumbia, SC 4.74e+00
## cbsa_titleColumbus, GA-AL -8.43e+00
## cbsa_titleColumbus, IN -7.08e+01
## cbsa_titleColumbus, MS -6.69e+00
## cbsa_titleColumbus, OH -2.45e+01
## cbsa_titleConcord, NH 5.41e-01
## cbsa_titleCookeville, TN 3.20e+01
## cbsa_titleCorbin, KY 2.19e+01
## cbsa_titleCorning, NY -6.14e+01
## cbsa_titleCorpus Christi, TX -2.27e+01
## cbsa_titleCorvallis, OR 3.77e+01
## cbsa_titleCrestview-Fort Walton Beach-Destin, FL 1.94e+02
## cbsa_titleCrossville, TN -2.73e+01
## cbsa_titleCullman, AL 8.63e-01
## cbsa_titleCumberland, MD-WV -2.08e+01
## cbsa_titleDallas-Fort Worth-Arlington, TX -8.93e+01
## cbsa_titleDalton, GA 2.43e+01
## cbsa_titleDanville, IL -2.71e+01
## cbsa_titleDanville, VA -3.97e+01
## cbsa_titleDaphne-Fairhope-Foley, AL 1.06e+02
## cbsa_titleDavenport-Moline-Rock Island, IA-IL -3.43e+01
## cbsa_titleDayton-Kettering-Beavercreek, OH -1.50e+01
## cbsa_titleDecatur, AL -2.97e+01
## cbsa_titleDecatur, IL -4.95e+01
## cbsa_titleDeltona-Daytona Beach-Ormond Beach, FL -1.97e+01
## cbsa_titleDenver-Aurora-Centennial, CO 9.70e-01
## cbsa_titleDes Moines-West Des Moines, IA 1.60e+01
## cbsa_titleDetroit-Warren-Dearborn, MI -3.87e+00
## cbsa_titleDothan, AL -6.07e+00
## cbsa_titleDover, DE 1.34e+01
## cbsa_titleDuBois, PA -1.40e+00
## cbsa_titleDubuque, IA 1.86e+01
## cbsa_titleDuluth, MN-WI 1.61e+01
## cbsa_titleDurham-Chapel Hill, NC -1.65e+01
## cbsa_titleEagle Pass, TX .
## cbsa_titleEast Stroudsburg, PA -2.77e+01
## cbsa_titleEau Claire, WI -1.19e+01
## cbsa_titleEl Centro, CA -3.05e-01
## cbsa_titleEl Paso, TX 9.07e-01
## cbsa_titleElizabethtown, KY -7.50e+00
## cbsa_titleElkhart-Goshen, IN -7.59e+00
## cbsa_titleElmira, NY -1.62e+01
## cbsa_titleEnid, OK -7.87e+01
## cbsa_titleErie, PA 1.65e+01
## cbsa_titleEugene-Springfield, OR 6.45e+01
## cbsa_titleEureka-Arcata, CA 8.27e+01
## cbsa_titleEvansville, IN -3.13e+01
## cbsa_titleFairbanks-College, AK -1.03e+02
## cbsa_titleFargo, ND-MN -1.07e+01
## cbsa_titleFaribault-Northfield, MN 1.92e+01
## cbsa_titleFarmington, MO -4.82e+00
## cbsa_titleFarmington, NM -2.33e+01
## cbsa_titleFayetteville-Springdale-Rogers, AR -5.89e+01
## cbsa_titleFayetteville, NC 2.74e+01
## cbsa_titleFindlay, OH -6.94e+01
## cbsa_titleFlagstaff, AZ 1.02e+02
## cbsa_titleFlint, MI 3.88e+00
## cbsa_titleFlorence-Muscle Shoals, AL -1.99e+01
## cbsa_titleFlorence, SC 6.87e+00
## cbsa_titleFond du Lac, WI -2.95e+00
## cbsa_titleForest City, NC 1.96e+01
## cbsa_titleFort Collins-Loveland, CO 9.85e+00
## cbsa_titleFort Payne, AL 5.74e+01
## cbsa_titleFort Smith, AR-OK -4.01e+01
## cbsa_titleFort Wayne, IN 3.61e+00
## cbsa_titleFrankfort, KY -8.43e+00
## cbsa_titleFresno, CA -4.12e+01
## cbsa_titleGadsden, AL 1.23e+01
## cbsa_titleGainesville, FL 2.11e+01
## cbsa_titleGainesville, GA -2.27e+01
## cbsa_titleGallup, NM -6.97e+01
## cbsa_titleGettysburg, PA -1.01e+00
## cbsa_titleGlens Falls, NY 2.25e+01
## cbsa_titleGoldsboro, NC 3.75e+01
## cbsa_titleGranbury, TX .
## cbsa_titleGrand Forks, ND-MN -3.96e+01
## cbsa_titleGrand Island, NE -7.03e+01
## cbsa_titleGrand Junction, CO 8.06e+01
## cbsa_titleGrand Rapids-Wyoming-Kentwood, MI 2.14e+01
## cbsa_titleGrants Pass, OR 4.88e+01
## cbsa_titleGreat Falls, MT 5.99e+01
## cbsa_titleGreeley, CO -3.93e+00
## cbsa_titleGreen Bay, WI 6.57e+00
## cbsa_titleGreeneville, TN 4.43e+01
## cbsa_titleGreenfield, MA .
## cbsa_titleGreensboro-High Point, NC -1.56e+01
## cbsa_titleGreenville-Anderson-Greer, SC -3.28e+00
## cbsa_titleGreenville, NC -1.95e+01
## cbsa_titleGreenwood, SC 5.43e+01
## cbsa_titleGulfport-Biloxi, MS .
## cbsa_titleHagerstown-Martinsburg, MD-WV 1.70e+01
## cbsa_titleHammond, LA 6.22e+01
## cbsa_titleHanford-Corcoran, CA -9.29e+00
## cbsa_titleHarrisburg-Carlisle, PA -5.14e+01
## cbsa_titleHarrisonburg, VA 3.24e+01
## cbsa_titleHartford-West Hartford-East Hartford, CT -3.26e+01
## cbsa_titleHattiesburg, MS 3.10e+01
## cbsa_titleHeber, UT 2.26e+02
## cbsa_titleHelena, MT 2.93e+01
## cbsa_titleHermiston-Pendleton, OR 7.99e+00
## cbsa_titleHermitage, PA 1.96e+01
## cbsa_titleHickory-Lenoir-Morganton, NC -6.87e+00
## cbsa_titleHilo-Kailua, HI -1.31e+02
## cbsa_titleHilton Head Island-Bluffton-Port Royal, SC 5.71e+01
## cbsa_titleHinesville, GA 1.55e+01
## cbsa_titleHobbs, NM -6.19e+01
## cbsa_titleHolland, MI 2.78e+01
## cbsa_titleHomosassa Springs, FL -1.08e+02
## cbsa_titleHot Springs, AR -1.93e+01
## cbsa_titleHouma-Bayou Cane-Thibodaux, LA -9.99e-01
## cbsa_titleHouston-Pasadena-The Woodlands, TX -9.21e+01
## cbsa_titleHuntington-Ashland, WV-KY-OH -3.32e+01
## cbsa_titleHuntsville, AL 1.99e+00
## cbsa_titleHuntsville, TX .
## cbsa_titleIdaho Falls, ID -2.44e+01
## cbsa_titleIndiana, PA -4.80e+01
## cbsa_titleIndianapolis-Carmel-Greenwood, IN -4.30e+01
## cbsa_titleIowa City, IA -3.14e+01
## cbsa_titleIthaca, NY -5.12e+01
## cbsa_titleJackson, MI -8.43e+00
## cbsa_titleJackson, MS 2.42e+01
## cbsa_titleJackson, TN 6.74e+01
## cbsa_titleJacksonville, FL -1.71e+00
## cbsa_titleJacksonville, NC 2.95e+01
## cbsa_titleJamestown-Dunkirk, NY -1.73e+00
## cbsa_titleJanesville-Beloit, WI 1.50e+01
## cbsa_titleJefferson City, MO -2.83e+00
## cbsa_titleJefferson, GA -2.33e+01
## cbsa_titleJohnson City, TN 3.64e+01
## cbsa_titleJohnstown, PA -1.54e+01
## cbsa_titleJonesboro, AR -2.41e+00
## cbsa_titleJoplin, MO-KS -9.68e+00
## cbsa_titleKahului-Wailuku, HI 4.13e+02
## cbsa_titleKalamazoo-Portage, MI -1.01e+01
## cbsa_titleKalispell, MT 1.95e+02
## cbsa_titleKankakee, IL -4.06e+01
## cbsa_titleKansas City, MO-KS -3.31e+00
## cbsa_titleKapaa, HI 2.88e+02
## cbsa_titleKeene, NH -2.25e+01
## cbsa_titleKennewick-Richland, WA 4.62e+01
## cbsa_titleKenosha, WI -2.10e+01
## cbsa_titleKey West-Key Largo, FL 4.56e+02
## cbsa_titleKilleen-Temple, TX 2.42e+01
## cbsa_titleKingsport-Bristol, TN-VA -9.53e+00
## cbsa_titleKingston, NY 4.53e+01
## cbsa_titleKiryas Joel-Poughkeepsie-Newburgh, NY 1.68e+01
## cbsa_titleKlamath Falls, OR -5.19e+00
## cbsa_titleKnoxville, TN 4.22e+01
## cbsa_titleKokomo, IN -3.01e+01
## cbsa_titleLa Crosse-Onalaska, WI-MN -1.16e+01
## cbsa_titleLaconia, NH -2.40e+01
## cbsa_titleLafayette-West Lafayette, IN 1.42e+01
## cbsa_titleLafayette, LA 1.21e+01
## cbsa_titleLaGrange, GA-AL 1.92e+01
## cbsa_titleLake Charles, LA -1.26e+01
## cbsa_titleLake City, FL 3.43e+01
## cbsa_titleLake Havasu City-Kingman, AZ -1.40e+00
## cbsa_titleLakeland-Winter Haven, FL -1.25e+01
## cbsa_titleLancaster, PA 1.41e+00
## cbsa_titleLansing-East Lansing, MI -5.05e+01
## cbsa_titleLaredo, TX -2.40e+01
## cbsa_titleLas Cruces, NM -9.52e+00
## cbsa_titleLas Vegas-Henderson-North Las Vegas, NV -8.45e+01
## cbsa_titleLaurel, MS 3.41e+01
## cbsa_titleLawrence, KS .
## cbsa_titleLawton, OK -5.87e+01
## cbsa_titleLebanon-Claremont, NH-VT 1.38e+00
## cbsa_titleLebanon, PA 2.86e+01
## cbsa_titleLewiston-Auburn, ME 5.00e+01
## cbsa_titleLewiston, ID-WA 3.93e+01
## cbsa_titleLexington Park, MD -2.12e+01
## cbsa_titleLexington-Fayette, KY -3.97e+00
## cbsa_titleLima, OH -1.24e+01
## cbsa_titleLincoln, NE -1.71e+01
## cbsa_titleLittle Rock-North Little Rock-Conway, AR -9.29e+00
## cbsa_titleLogan, UT-ID 4.63e+01
## cbsa_titleLongview-Kelso, WA 6.50e+01
## cbsa_titleLongview, TX -6.15e+00
## cbsa_titleLos Angeles-Long Beach-Anaheim, CA 2.14e+02
## cbsa_titleLouisville/Jefferson County, KY-IN -1.37e+01
## cbsa_titleLubbock, TX -4.75e+01
## cbsa_titleLufkin, TX -1.68e+01
## cbsa_titleLumberton, NC -1.34e+01
## cbsa_titleLynchburg, VA 2.00e+01
## cbsa_titleMacon-Bibb County, GA -2.73e+01
## cbsa_titleMadison, WI -3.58e+01
## cbsa_titleManchester-Nashua, NH -1.27e+01
## cbsa_titleManhattan, KS -6.19e+01
## cbsa_titleManitowoc, WI -8.96e+00
## cbsa_titleMankato, MN -1.48e+01
## cbsa_titleMansfield, OH 4.62e+00
## cbsa_titleMarinette, WI-MI -6.26e+01
## cbsa_titleMarion-Herrin, IL -8.83e+01
## cbsa_titleMarion, IN 5.05e+01
## cbsa_titleMarquette, MI -1.40e+00
## cbsa_titleMassena-Ogdensburg, NY -2.64e+01
## cbsa_titleMcAllen-Edinburg-Mission, TX -3.43e+01
## cbsa_titleMeadville, PA -2.67e+01
## cbsa_titleMedford, OR 4.58e+01
## cbsa_titleMemphis, TN-MS-AR 2.06e+01
## cbsa_titleMerced, CA 2.02e+01
## cbsa_titleMeridian, MS -4.68e+01
## cbsa_titleMiami-Fort Lauderdale-West Palm Beach, FL 4.47e+01
## cbsa_titleMichigan City-La Porte, IN .
## cbsa_titleMidland, MI 1.78e+01
## cbsa_titleMidland, TX -1.72e+01
## cbsa_titleMilwaukee-Waukesha, WI -1.42e+01
## cbsa_titleMinneapolis-St. Paul-Bloomington, MN-WI -4.98e+01
## cbsa_titleMinot, ND -3.70e+01
## cbsa_titleMissoula, MT 1.21e+02
## cbsa_titleMobile, AL 2.63e+01
## cbsa_titleModesto, CA 5.09e+00
## cbsa_titleMonroe, LA 9.42e+00
## cbsa_titleMonroe, MI 5.86e+01
## cbsa_titleMontgomery, AL 1.35e+01
## cbsa_titleMonticello, NY 1.02e+01
## cbsa_titleMorehead City, NC 2.26e+01
## cbsa_titleMorgantown, WV -1.59e+01
## cbsa_titleMorristown, TN 5.37e+01
## cbsa_titleMoses Lake, WA 3.21e+01
## cbsa_titleMount Airy, NC 1.44e+00
## cbsa_titleMount Pleasant, MI -1.77e+01
## cbsa_titleMount Vernon-Anacortes, WA 5.20e+01
## cbsa_titleMuncie, IN -2.30e+01
## cbsa_titleMurrells Inlet, SC 4.54e+01
## cbsa_titleMuskegon-Norton Shores, MI 7.37e+01
## cbsa_titleMuskogee, OK -5.73e+01
## cbsa_titleMyrtle Beach-Conway-North Myrtle Beach, SC -1.74e+01
## cbsa_titleNacogdoches, TX -1.05e+00
## cbsa_titleNapa, CA 3.02e+02
## cbsa_titleNaples-Marco Island, FL -1.76e-01
## cbsa_titleNashville-Davidson--Murfreesboro--Franklin, TN 2.64e+01
## cbsa_titleNew Bern, NC 1.52e+01
## cbsa_titleNew Haven, CT -1.01e+01
## cbsa_titleNew Iberia, LA 6.66e+00
## cbsa_titleNew Orleans-Metairie, LA 1.19e+01
## cbsa_titleNew Philadelphia-Dover, OH -2.31e+01
## cbsa_titleNew York-Newark-Jersey City, NY-NJ 1.75e+02
## cbsa_titleNiles, MI 3.58e+00
## cbsa_titleNorth Port-Bradenton-Sarasota, FL -8.85e+01
## cbsa_titleNorth Wilkesboro, NC 3.59e+01
## cbsa_titleNorwich-New London-Willimantic, CT 3.64e+00
## cbsa_titleOak Harbor, WA 2.17e+01
## cbsa_titleOcala, FL -6.74e+01
## cbsa_titleOdessa, TX -1.72e+01
## cbsa_titleOgden, UT 2.29e+00
## cbsa_titleOklahoma City, OK -1.67e+01
## cbsa_titleOlean, NY -3.59e+00
## cbsa_titleOlympia-Lacey-Tumwater, WA -3.68e+01
## cbsa_titleOmaha, NE-IA -2.27e+01
## cbsa_titleOpelousas, LA 4.97e+00
## cbsa_titleOrangeburg, SC -2.79e+01
## cbsa_titleOrlando-Kissimmee-Sanford, FL -2.31e+01
## cbsa_titleOshkosh-Neenah, WI -7.08e+00
## cbsa_titleOttawa, IL -3.43e+01
## cbsa_titleOwensboro, KY 1.25e+01
## cbsa_titleOwosso, MI -1.31e+01
## cbsa_titleOxford, MS -1.83e-01
## cbsa_titleOxnard-Thousand Oaks-Ventura, CA 1.80e+02
## cbsa_titlePaducah, KY-IL -8.32e+01
## cbsa_titlePalatka, FL 5.21e+01
## cbsa_titlePalm Bay-Melbourne-Titusville, FL 2.66e-01
## cbsa_titlePanama City-Panama City Beach, FL 4.79e+01
## cbsa_titleParkersburg-Vienna, WV -3.04e+01
## cbsa_titlePensacola-Ferry Pass-Brent, FL 2.92e+01
## cbsa_titlePeoria, IL -6.53e+01
## cbsa_titlePhiladelphia-Camden-Wilmington, PA-NJ-DE-MD .
## cbsa_titlePhoenix-Mesa-Chandler, AZ -4.38e+00
## cbsa_titlePikeville, KY -2.28e+01
## cbsa_titlePine Bluff, AR -3.44e+01
## cbsa_titlePinehurst-Southern Pines, NC 2.02e+01
## cbsa_titlePittsburgh, PA -1.78e+01
## cbsa_titlePittsfield, MA 5.91e+01
## cbsa_titlePlattsburgh, NY -3.59e+01
## cbsa_titlePocatello, ID 2.02e+01
## cbsa_titlePort Angeles, WA 3.91e+00
## cbsa_titlePort St. Lucie, FL -3.80e+01
## cbsa_titlePortland-South Portland, ME 9.67e+01
## cbsa_titlePortland-Vancouver-Hillsboro, OR-WA -1.85e+01
## cbsa_titlePortsmouth, OH .
## cbsa_titlePottsville, PA -4.22e+01
## cbsa_titlePrescott Valley-Prescott, AZ 1.79e+01
## cbsa_titleProvidence-Warwick, RI-MA 6.20e+01
## cbsa_titleProvo-Orem-Lehi, UT -3.49e+00
## cbsa_titlePueblo, CO 4.36e+00
## cbsa_titlePunta Gorda, FL -1.56e+02
## cbsa_titlePutnam, CT 1.75e+01
## cbsa_titleQuincy, IL-MO -4.66e+01
## cbsa_titleRacine-Mount Pleasant, WI 2.42e+01
## cbsa_titleRaleigh-Cary, NC -6.82e+01
## cbsa_titleRapid City, SD 3.80e+01
## cbsa_titleReading, PA -8.44e-01
## cbsa_titleRedding, CA -3.47e-01
## cbsa_titleReno, NV 3.18e+00
## cbsa_titleRexburg, ID 8.90e+00
## cbsa_titleRichmond-Berea, KY 1.43e+01
## cbsa_titleRichmond, IN 3.69e+01
## cbsa_titleRichmond, VA 1.14e+01
## cbsa_titleRifle, CO 6.18e+02
## cbsa_titleRio Grande City-Roma, TX .
## cbsa_titleRiverside-San Bernardino-Ontario, CA 1.11e+00
## cbsa_titleRoanoke Rapids, NC -3.72e+01
## cbsa_titleRoanoke, VA -1.85e+01
## cbsa_titleRochester, MN -2.37e+01
## cbsa_titleRochester, NY -2.31e+01
## cbsa_titleRockford, IL -5.95e+01
## cbsa_titleRocky Mount, NC 2.72e+01
## cbsa_titleRome, GA 8.93e+00
## cbsa_titleRoseburg, OR 6.08e+01
## cbsa_titleRussellville, AR -4.11e+01
## cbsa_titleSacramento-Roseville-Folsom, CA -6.38e+01
## cbsa_titleSaginaw, MI .
## cbsa_titleSalem, OH 2.55e+00
## cbsa_titleSalem, OR 3.54e-01
## cbsa_titleSalinas, CA 3.74e+02
## cbsa_titleSalisbury, MD 1.59e+01
## cbsa_titleSalt Lake City-Murray, UT -2.22e+01
## cbsa_titleSan Angelo, TX -5.51e+01
## cbsa_titleSan Antonio-New Braunfels, TX -4.80e+01
## cbsa_titleSan Diego-Chula Vista-Carlsbad, CA 1.78e+02
## cbsa_titleSan Francisco-Oakland-Fremont, CA 8.72e+01
## cbsa_titleSan Jose-Sunnyvale-Santa Clara, CA 1.11e+02
## cbsa_titleSan Luis Obispo-Paso Robles, CA 2.33e+02
## cbsa_titleSandusky, OH -1.34e+00
## cbsa_titleSanford, NC 1.39e+01
## cbsa_titleSanta Cruz-Watsonville, CA 3.84e+02
## cbsa_titleSanta Fe, NM 4.78e+01
## cbsa_titleSanta Maria-Santa Barbara, CA 6.04e+02
## cbsa_titleSanta Rosa-Petaluma, CA 1.74e+02
## cbsa_titleSavannah, GA 1.73e+01
## cbsa_titleScranton--Wilkes-Barre, PA -2.61e+01
## cbsa_titleSeaford, DE -5.33e+01
## cbsa_titleSearcy, AR .
## cbsa_titleSeattle-Tacoma-Bellevue, WA 3.28e+00
## cbsa_titleSebastian-Vero Beach-West Vero Corridor, FL -5.05e+01
## cbsa_titleSebring, FL -9.51e+01
## cbsa_titleSeneca, SC 8.66e+00
## cbsa_titleSevierville, TN 1.08e+02
## cbsa_titleShawnee, OK -1.74e+01
## cbsa_titleSheboygan, WI -3.78e+01
## cbsa_titleShelby-Kings Mountain, NC .
## cbsa_titleShelton, WA -2.41e+01
## cbsa_titleSherman-Denison, TX 1.84e+01
## cbsa_titleShow Low, AZ 6.67e+01
## cbsa_titleShreveport-Bossier City, LA 1.62e+01
## cbsa_titleSierra Vista-Douglas, AZ -2.04e+01
## cbsa_titleSioux City, IA-NE-SD -6.07e+01
## cbsa_titleSioux Falls, SD-MN 2.12e+01
## cbsa_titleSlidell-Mandeville-Covington, LA 2.31e+01
## cbsa_titleSomerset, KY 2.23e+01
## cbsa_titleSomerset, PA -2.91e+01
## cbsa_titleSouth Bend-Mishawaka, IN-MI 3.09e+00
## cbsa_titleSpartanburg, SC .
## cbsa_titleSpokane-Spokane Valley, WA 9.46e+00
## cbsa_titleSpringfield, IL -1.06e+02
## cbsa_titleSpringfield, MA 1.14e+01
## cbsa_titleSpringfield, MO -5.52e+00
## cbsa_titleSpringfield, OH -9.84e+00
## cbsa_titleSt. Cloud, MN .
## cbsa_titleSt. George, UT 6.09e+01
## cbsa_titleSt. Joseph, MO-KS -6.43e+01
## cbsa_titleSt. Louis, MO-IL -3.54e+01
## cbsa_titleState College, PA -3.32e+01
## cbsa_titleStatesboro, GA 8.61e+01
## cbsa_titleStaunton-Stuarts Draft, VA 2.27e+01
## cbsa_titleStevens Point-Plover, WI -6.57e+00
## cbsa_titleStillwater, OK -2.83e+01
## cbsa_titleStockton-Lodi, CA -1.03e+02
## cbsa_titleSumter, SC 4.03e+01
## cbsa_titleSunbury, PA -1.81e+01
## cbsa_titleSyracuse, NY -9.40e+00
## cbsa_titleTalladega-Sylacauga, AL 7.19e+00
## cbsa_titleTallahassee, FL 2.34e+01
## cbsa_titleTampa-St. Petersburg-Clearwater, FL -3.87e+01
## cbsa_titleTerre Haute, IN -1.80e+01
## cbsa_titleTexarkana, TX-AR -3.26e+00
## cbsa_titleThomasville, GA 4.39e+01
## cbsa_titleToledo, OH .
## cbsa_titleTopeka, KS -4.34e+01
## cbsa_titleTorrington, CT 4.63e+01
## cbsa_titleTraverse City, MI 5.03e+01
## cbsa_titleTrenton-Princeton, NJ -9.64e+01
## cbsa_titleTruckee-Grass Valley, CA -1.20e+01
## cbsa_titleTucson, AZ -1.45e+01
## cbsa_titleTullahoma-Manchester, TN 5.42e+01
## cbsa_titleTulsa, OK -3.46e+01
## cbsa_titleTupelo, MS 3.87e+01
## cbsa_titleTuscaloosa, AL 2.86e+01
## cbsa_titleTwin Falls, ID 7.13e+01
## cbsa_titleTyler, TX -3.28e+01
## cbsa_titleUkiah, CA 5.60e+01
## cbsa_titleUrban Honolulu, HI -1.46e+02
## cbsa_titleUtica-Rome, NY -6.92e+00
## cbsa_titleValdosta, GA 4.24e+01
## cbsa_titleVallejo, CA -5.93e+01
## cbsa_titleVictoria, TX -1.90e+01
## cbsa_titleVineland, NJ 2.06e+00
## cbsa_titleVirginia Beach-Chesapeake-Norfolk, VA-NC 1.30e+01
## cbsa_titleVisalia, CA .
## cbsa_titleWaco, TX 2.10e+01
## cbsa_titleWalla Walla, WA 1.00e+01
## cbsa_titleWarner Robins, GA -2.59e+01
## cbsa_titleWarsaw, IN -9.38e+00
## cbsa_titleWashington-Arlington-Alexandria, DC-VA-MD-WV -4.21e+01
## cbsa_titleWaterbury-Shelton, CT 2.41e+01
## cbsa_titleWaterloo-Cedar Falls, IA .
## cbsa_titleWatertown-Fort Atkinson, WI -3.80e+01
## cbsa_titleWatertown-Fort Drum, NY -4.49e+01
## cbsa_titleWausau, WI -7.01e+01
## cbsa_titleWeirton-Steubenville, WV-OH -1.91e+01
## cbsa_titleWenatchee-East Wenatchee, WA 7.03e+01
## cbsa_titleWheeling, WV-OH -1.08e+01
## cbsa_titleWhitewater-Elkhorn, WI -8.18e+00
## cbsa_titleWichita Falls, TX -5.90e+01
## cbsa_titleWichita, KS -3.74e+01
## cbsa_titleWildwood-The Villages, FL -1.30e+02
## cbsa_titleWilliamsport, PA 1.15e+00
## cbsa_titleWilmington, NC 9.79e+00
## cbsa_titleWilson, NC -8.82e+00
## cbsa_titleWinchester, VA-WV 2.24e+01
## cbsa_titleWinston-Salem, NC -9.55e+00
## cbsa_titleWisconsin Rapids-Marshfield, WI -4.40e+01
## cbsa_titleWooster, OH 1.61e+01
## cbsa_titleWorcester, MA 9.36e+00
## cbsa_titleYakima, WA 3.14e+01
## cbsa_titleYork-Hanover, PA 8.13e+00
## cbsa_titleYoungstown-Warren, OH 1.21e+01
## cbsa_titleYuba City, CA -8.36e+01
## cbsa_titleYuma, AZ 1.55e+00
## cbsa_titleZanesville, OH -7.27e+00
## total_listing_count -2.85e-05
## pending_ratio 3.82e-01
## median_days_on_market -2.27e-01
cv_lasso_price_no_cbsa = cv.glmnet(X_train_no_cbsa_matrix, price_train, alpha = 1)
lasso_price_no_cbsa_best_lambda = cv_lasso_price_no_cbsa$lambda.min
lasso_price_no_cbsa_best_lambda
## [1] 0.00871
plot(cv_lasso_price_no_cbsa)
lasso_price_no_cbsa_best_model = glmnet(X_train_no_cbsa_matrix, price_train, alpha = 1, lambda=lasso_price_no_cbsa_best_lambda)
coef(lasso_price_no_cbsa_best_model)
## 53 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 8558.64296086404
## total_population 0.00005748036
## pct_female 2.64249225410
## pct_white -3.45508439291
## pct_black -3.97377644423
## pct_aian -3.56351837921
## pct_asian -2.76107999747
## pct_nhpi 22.12693667884
## pct_two_or_more -4.04460426753
## pct_hispanic 0.43562573520
## median_age 2.39759589186
## pct_under_18 -4.37355011855
## total_households -0.00043711653
## pct_married_couple -4.33307565435
## pct_male_householder -8.59148693367
## pct_female_householder -3.63143394876
## pct_unemployed -7.62822856157
## pct_below_poverty -0.97023858697
## pct_income_10_14999 1.03952918534
## pct_income_15_24999 6.29423663715
## pct_income_25_34999 3.20800639128
## pct_income_35_49999 4.32468728962
## pct_income_50_74999 3.45166318929
## pct_income_75_99999 0.00137717272
## pct_income_100_149k 4.64515410501
## pct_income_150_199k 1.82586284515
## pct_income_200k_plus 8.03131887836
## median_household_income 0.00417472933
## pct_high_school_plus -0.15404493456
## pct_bachelors_plus -0.67637316136
## total_housing_units 0.00028596733
## pct_vacant_units 4.36141351837
## pct_owner_occupied -3.98585115960
## pct_built_2020_plus 1.49258264384
## pct_built_2010_2019 0.58493218356
## pct_built_2000_2009 1.74238104171
## pct_built_1990_1999 5.34408170918
## pct_built_1980_1989 0.04760619616
## pct_built_1970_1979 4.42778994983
## pct_built_1960_1969 3.28417415527
## pct_built_1950_1959 1.44522674786
## pct_built_1940_1949 1.06215683361
## pct_1_bed -3.02376699786
## pct_2_bed -5.81814692770
## pct_3_bed -5.62966127322
## pct_4_bed -10.03801360160
## pct_5plus_bed -4.75030766219
## area_typeMetro -19.11903103655
## area_typeMicro 0.00000000117
## month_date_yyyymm -0.03785014164
## total_listing_count -0.00427729735
## pending_ratio -13.12349573252
## median_days_on_market 0.32987981969
cv_lasso_log_price = cv.glmnet(X_train_matrix, log_price_train, alpha = 1)
lasso_log_price_best_lambda = cv_lasso_log_price$lambda.min
lasso_log_price_best_lambda
## [1] 0.0000929
plot(cv_lasso_log_price)
lasso_log_price_best_model = glmnet(X_train_matrix, log_price_train, alpha = 1, lambda=lasso_log_price_best_lambda)
coef(lasso_log_price_best_model)
## 581 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -4.25e+02
## total_population .
## pct_female 1.82e-03
## pct_white -4.52e-03
## pct_black -7.74e-03
## pct_aian 2.55e-04
## pct_asian 7.46e-03
## pct_nhpi 1.01e-02
## pct_two_or_more .
## pct_hispanic 1.14e-03
## median_age 1.40e-02
## pct_under_18 -4.18e-04
## total_households 2.79e-09
## pct_married_couple .
## pct_male_householder -1.63e-02
## pct_female_householder -5.09e-03
## pct_unemployed 2.79e-02
## pct_below_poverty -1.48e-02
## pct_income_10_14999 -4.08e-02
## pct_income_15_24999 -1.18e-02
## pct_income_25_34999 -1.39e-02
## pct_income_35_49999 -1.61e-02
## pct_income_50_74999 -2.29e-02
## pct_income_75_99999 -2.08e-02
## pct_income_100_149k -3.23e-04
## pct_income_150_199k 1.02e-03
## pct_income_200k_plus 1.11e-02
## median_household_income .
## pct_high_school_plus .
## pct_bachelors_plus .
## total_housing_units .
## pct_vacant_units 1.25e-02
## pct_owner_occupied -9.68e-03
## pct_built_2020_plus 1.26e-02
## pct_built_2010_2019 .
## pct_built_2000_2009 2.40e-04
## pct_built_1990_1999 1.02e-02
## pct_built_1980_1989 2.56e-03
## pct_built_1970_1979 1.21e-02
## pct_built_1960_1969 -2.67e-03
## pct_built_1950_1959 -5.08e-03
## pct_built_1940_1949 -6.02e-04
## pct_1_bed 4.79e-05
## pct_2_bed 6.98e-03
## pct_3_bed -2.13e-06
## pct_4_bed -1.57e-02
## pct_5plus_bed 1.14e-03
## area_typeMetro -4.97e-03
## area_typeMicro 7.02e-17
## month_date_yyyymm 2.13e-03
## cbsa_titleAbilene, TX -4.88e-02
## cbsa_titleAdrian, MI 2.46e-02
## cbsa_titleAkron, OH -8.38e-02
## cbsa_titleAlamogordo, NM -1.48e-01
## cbsa_titleAlbany-Schenectady-Troy, NY 5.40e-02
## cbsa_titleAlbany, GA -2.45e-01
## cbsa_titleAlbany, OR 2.92e-01
## cbsa_titleAlbemarle, NC 1.48e-01
## cbsa_titleAlbertville, AL 1.18e-01
## cbsa_titleAlbuquerque, NM -3.59e-02
## cbsa_titleAlexandria, LA -2.24e-01
## cbsa_titleAllentown-Bethlehem-Easton, PA-NJ -2.07e-02
## cbsa_titleAltoona, PA -3.92e-01
## cbsa_titleAmarillo, TX -1.31e-01
## cbsa_titleAmes, IA 2.82e-01
## cbsa_titleAmherst Town-Northampton, MA 2.55e-01
## cbsa_titleAnchorage, AK -4.60e-01
## cbsa_titleAnderson Creek, NC 1.80e-01
## cbsa_titleAnn Arbor, MI -5.21e-02
## cbsa_titleAnniston-Oxford, AL -2.36e-01
## cbsa_titleAppleton, WI 1.77e-01
## cbsa_titleAsheville, NC 1.36e-01
## cbsa_titleAthens-Clarke County, GA 1.16e-01
## cbsa_titleAthens, TN .
## cbsa_titleAthens, TX -1.91e-02
## cbsa_titleAtlanta-Sandy Springs-Roswell, GA -1.61e-01
## cbsa_titleAtlantic City-Hammonton, NJ 2.51e-01
## cbsa_titleAuburn-Opelika, AL 1.13e-01
## cbsa_titleAuburn, NY -1.47e-01
## cbsa_titleAugusta-Richmond County, GA-SC -3.05e-02
## cbsa_titleAugusta-Waterville, ME 1.29e-01
## cbsa_titleAustin-Round Rock-San Marcos, TX -2.41e-01
## cbsa_titleBakersfield-Delano, CA 3.38e-02
## cbsa_titleBaltimore-Columbia-Towson, MD -1.02e-01
## cbsa_titleBangor, ME -1.73e-03
## cbsa_titleBaraboo, WI -2.03e-02
## cbsa_titleBarnstable Town, MA 3.31e-01
## cbsa_titleBaton Rouge, LA 6.39e-02
## cbsa_titleBattle Creek, MI 1.24e-01
## cbsa_titleBay City, MI 6.71e-03
## cbsa_titleBeaumont-Port Arthur, TX -1.99e-01
## cbsa_titleBeaver Dam, WI 1.06e-01
## cbsa_titleBeckley, WV -1.40e-01
## cbsa_titleBellingham, WA 2.19e-01
## cbsa_titleBend, OR -3.79e-02
## cbsa_titleBillings, MT 1.38e-01
## cbsa_titleBinghamton, NY -8.40e-02
## cbsa_titleBirmingham, AL -3.49e-02
## cbsa_titleBismarck, ND 9.43e-02
## cbsa_titleBlacksburg-Christiansburg-Radford, VA -6.77e-02
## cbsa_titleBloomington, IL -3.60e-01
## cbsa_titleBloomington, IN -1.59e-02
## cbsa_titleBloomsburg-Berwick, PA 7.33e-03
## cbsa_titleBluefield, WV-VA -3.37e-01
## cbsa_titleBoise City, ID 2.28e-01
## cbsa_titleBoston-Cambridge-Newton, MA-NH 2.37e-01
## cbsa_titleBoulder, CO 2.51e-01
## cbsa_titleBowling Green, KY -3.72e-02
## cbsa_titleBozeman, MT 4.31e-01
## cbsa_titleBrainerd, MN -7.31e-03
## cbsa_titleBremerton-Silverdale-Port Orchard, WA -1.66e-01
## cbsa_titleBridgeport-Stamford-Danbury, CT 1.49e-01
## cbsa_titleBrigham City, UT-ID 1.76e-01
## cbsa_titleBrownsville-Harlingen, TX .
## cbsa_titleBrunswick-St. Simons, GA 1.12e-01
## cbsa_titleBuffalo-Cheektowaga, NY 1.43e-01
## cbsa_titleBurlington-South Burlington, VT 1.85e-01
## cbsa_titleBurlington, NC .
## cbsa_titleCanton-Massillon, OH -1.04e-01
## cbsa_titleCape Coral-Fort Myers, FL -4.13e-01
## cbsa_titleCape Girardeau, MO-IL -2.67e-01
## cbsa_titleCarson City, NV -1.46e-01
## cbsa_titleCasper, WY 1.07e-01
## cbsa_titleCedar City, UT 2.65e-01
## cbsa_titleCedar Rapids, IA -2.53e-02
## cbsa_titleCentralia, WA 1.07e-01
## cbsa_titleChambersburg, PA -2.69e-03
## cbsa_titleChampaign-Urbana, IL -1.45e-01
## cbsa_titleCharleston-North Charleston, SC 2.10e-01
## cbsa_titleCharleston, WV -3.95e-01
## cbsa_titleCharlotte-Concord-Gastonia, NC-SC -6.12e-02
## cbsa_titleCharlottesville, VA 6.70e-04
## cbsa_titleChattanooga, TN-GA 1.26e-01
## cbsa_titleCheyenne, WY 1.83e-02
## cbsa_titleChicago-Naperville-Elgin, IL-IN -2.10e-01
## cbsa_titleChico, CA 9.00e-02
## cbsa_titleChillicothe, OH .
## cbsa_titleCincinnati, OH-KY-IN 2.98e-02
## cbsa_titleClarksburg, WV -1.88e-01
## cbsa_titleClarksville, TN-KY 8.98e-02
## cbsa_titleClearlake, CA 2.56e-01
## cbsa_titleCleveland, OH -5.33e-02
## cbsa_titleCleveland, TN -2.11e-01
## cbsa_titleClovis, NM .
## cbsa_titleCoeur d'Alene, ID 2.58e-01
## cbsa_titleCollege Station-Bryan, TX 3.18e-02
## cbsa_titleColorado Springs, CO .
## cbsa_titleColumbia, MO -7.55e-02
## cbsa_titleColumbia, SC .
## cbsa_titleColumbus, GA-AL -1.12e-01
## cbsa_titleColumbus, IN -1.43e-01
## cbsa_titleColumbus, MS -1.33e-01
## cbsa_titleColumbus, OH -4.51e-02
## cbsa_titleConcord, NH -7.81e-03
## cbsa_titleCookeville, TN 1.34e-01
## cbsa_titleCorbin, KY 1.57e-01
## cbsa_titleCorning, NY -3.83e-01
## cbsa_titleCorpus Christi, TX 3.01e-03
## cbsa_titleCorvallis, OR 1.41e-01
## cbsa_titleCrestview-Fort Walton Beach-Destin, FL 4.94e-01
## cbsa_titleCrossville, TN -1.72e-02
## cbsa_titleCullman, AL -3.06e-02
## cbsa_titleCumberland, MD-WV -2.02e-01
## cbsa_titleDallas-Fort Worth-Arlington, TX -2.96e-01
## cbsa_titleDalton, GA 1.61e-01
## cbsa_titleDanville, IL -6.08e-01
## cbsa_titleDanville, VA -1.04e-01
## cbsa_titleDaphne-Fairhope-Foley, AL 3.69e-01
## cbsa_titleDavenport-Moline-Rock Island, IA-IL -1.71e-01
## cbsa_titleDayton-Kettering-Beavercreek, OH -4.62e-02
## cbsa_titleDecatur, AL -1.26e-01
## cbsa_titleDecatur, IL -4.96e-01
## cbsa_titleDeltona-Daytona Beach-Ormond Beach, FL -2.55e-02
## cbsa_titleDenver-Aurora-Centennial, CO -8.14e-02
## cbsa_titleDes Moines-West Des Moines, IA 1.45e-01
## cbsa_titleDetroit-Warren-Dearborn, MI 7.92e-02
## cbsa_titleDothan, AL -9.60e-02
## cbsa_titleDover, DE 1.47e-01
## cbsa_titleDuBois, PA -1.42e-01
## cbsa_titleDubuque, IA 1.28e-01
## cbsa_titleDuluth, MN-WI 8.63e-02
## cbsa_titleDurham-Chapel Hill, NC -5.28e-03
## cbsa_titleEagle Pass, TX .
## cbsa_titleEast Stroudsburg, PA -1.70e-01
## cbsa_titleEau Claire, WI -8.61e-02
## cbsa_titleEl Centro, CA -5.78e-02
## cbsa_titleEl Paso, TX 5.13e-03
## cbsa_titleElizabethtown, KY -4.75e-02
## cbsa_titleElkhart-Goshen, IN -1.36e-01
## cbsa_titleElmira, NY -2.68e-01
## cbsa_titleEnid, OK -3.60e-01
## cbsa_titleErie, PA 7.44e-02
## cbsa_titleEugene-Springfield, OR 3.21e-01
## cbsa_titleEureka-Arcata, CA 3.97e-01
## cbsa_titleEvansville, IN -1.88e-01
## cbsa_titleFairbanks-College, AK -4.00e-01
## cbsa_titleFargo, ND-MN -2.20e-02
## cbsa_titleFaribault-Northfield, MN 1.04e-01
## cbsa_titleFarmington, MO -5.06e-02
## cbsa_titleFarmington, NM 3.58e-03
## cbsa_titleFayetteville-Springdale-Rogers, AR -1.85e-01
## cbsa_titleFayetteville, NC 1.71e-01
## cbsa_titleFindlay, OH -2.37e-01
## cbsa_titleFlagstaff, AZ 2.52e-01
## cbsa_titleFlint, MI -1.26e-02
## cbsa_titleFlorence-Muscle Shoals, AL -1.83e-01
## cbsa_titleFlorence, SC -1.36e-02
## cbsa_titleFond du Lac, WI 1.42e-01
## cbsa_titleForest City, NC 3.98e-02
## cbsa_titleFort Collins-Loveland, CO -5.01e-02
## cbsa_titleFort Payne, AL 2.23e-01
## cbsa_titleFort Smith, AR-OK -2.32e-01
## cbsa_titleFort Wayne, IN 8.82e-02
## cbsa_titleFrankfort, KY -3.40e-02
## cbsa_titleFresno, CA -1.65e-02
## cbsa_titleGadsden, AL 8.10e-02
## cbsa_titleGainesville, FL 2.26e-01
## cbsa_titleGainesville, GA -1.41e-01
## cbsa_titleGallup, NM -2.26e-01
## cbsa_titleGettysburg, PA -1.21e-02
## cbsa_titleGlens Falls, NY 1.37e-01
## cbsa_titleGoldsboro, NC 2.09e-01
## cbsa_titleGranbury, TX -1.36e-02
## cbsa_titleGrand Forks, ND-MN -1.41e-01
## cbsa_titleGrand Island, NE -4.38e-01
## cbsa_titleGrand Junction, CO 2.67e-01
## cbsa_titleGrand Rapids-Wyoming-Kentwood, MI 1.36e-01
## cbsa_titleGrants Pass, OR 1.96e-01
## cbsa_titleGreat Falls, MT 4.70e-01
## cbsa_titleGreeley, CO -1.04e-01
## cbsa_titleGreen Bay, WI 8.32e-02
## cbsa_titleGreeneville, TN 2.60e-01
## cbsa_titleGreenfield, MA 1.50e-01
## cbsa_titleGreensboro-High Point, NC 3.89e-02
## cbsa_titleGreenville-Anderson-Greer, SC -4.42e-02
## cbsa_titleGreenville, NC -4.58e-02
## cbsa_titleGreenwood, SC 2.71e-01
## cbsa_titleGulfport-Biloxi, MS 4.43e-02
## cbsa_titleHagerstown-Martinsburg, MD-WV 6.49e-02
## cbsa_titleHammond, LA 1.93e-01
## cbsa_titleHanford-Corcoran, CA -7.40e-02
## cbsa_titleHarrisburg-Carlisle, PA -8.84e-02
## cbsa_titleHarrisonburg, VA 1.89e-01
## cbsa_titleHartford-West Hartford-East Hartford, CT -9.16e-02
## cbsa_titleHattiesburg, MS 2.61e-02
## cbsa_titleHeber, UT 1.12e-01
## cbsa_titleHelena, MT 7.78e-02
## cbsa_titleHermiston-Pendleton, OR 2.07e-01
## cbsa_titleHermitage, PA 9.61e-02
## cbsa_titleHickory-Lenoir-Morganton, NC 3.71e-02
## cbsa_titleHilo-Kailua, HI 1.88e-02
## cbsa_titleHilton Head Island-Bluffton-Port Royal, SC 1.78e-01
## cbsa_titleHinesville, GA 5.97e-02
## cbsa_titleHobbs, NM -4.37e-01
## cbsa_titleHolland, MI 1.52e-01
## cbsa_titleHomosassa Springs, FL -4.01e-01
## cbsa_titleHot Springs, AR -3.91e-02
## cbsa_titleHouma-Bayou Cane-Thibodaux, LA -1.27e-01
## cbsa_titleHouston-Pasadena-The Woodlands, TX -3.57e-01
## cbsa_titleHuntington-Ashland, WV-KY-OH -3.14e-01
## cbsa_titleHuntsville, AL -2.35e-02
## cbsa_titleHuntsville, TX -6.76e-02
## cbsa_titleIdaho Falls, ID -1.50e-01
## cbsa_titleIndiana, PA -3.44e-01
## cbsa_titleIndianapolis-Carmel-Greenwood, IN -1.88e-01
## cbsa_titleIowa City, IA -2.64e-02
## cbsa_titleIthaca, NY -9.87e-04
## cbsa_titleJackson, MI -1.84e-02
## cbsa_titleJackson, MS 1.01e-01
## cbsa_titleJackson, TN 2.59e-01
## cbsa_titleJacksonville, FL 1.51e-02
## cbsa_titleJacksonville, NC 1.33e-01
## cbsa_titleJamestown-Dunkirk, NY -8.73e-02
## cbsa_titleJanesville-Beloit, WI 1.20e-01
## cbsa_titleJefferson City, MO -1.11e-01
## cbsa_titleJefferson, GA -1.38e-01
## cbsa_titleJohnson City, TN 2.56e-01
## cbsa_titleJohnstown, PA -3.52e-01
## cbsa_titleJonesboro, AR -4.74e-02
## cbsa_titleJoplin, MO-KS -9.15e-02
## cbsa_titleKahului-Wailuku, HI 3.36e-01
## cbsa_titleKalamazoo-Portage, MI -3.00e-02
## cbsa_titleKalispell, MT 6.84e-01
## cbsa_titleKankakee, IL -1.76e-01
## cbsa_titleKansas City, MO-KS .
## cbsa_titleKapaa, HI 3.95e-01
## cbsa_titleKeene, NH -2.24e-02
## cbsa_titleKennewick-Richland, WA 1.47e-01
## cbsa_titleKenosha, WI -2.19e-02
## cbsa_titleKey West-Key Largo, FL 3.00e-01
## cbsa_titleKilleen-Temple, TX 1.64e-01
## cbsa_titleKingsport-Bristol, TN-VA -1.24e-02
## cbsa_titleKingston, NY 1.69e-01
## cbsa_titleKiryas Joel-Poughkeepsie-Newburgh, NY 7.28e-02
## cbsa_titleKlamath Falls, OR 3.05e-02
## cbsa_titleKnoxville, TN 1.88e-01
## cbsa_titleKokomo, IN -1.52e-01
## cbsa_titleLa Crosse-Onalaska, WI-MN 3.03e-02
## cbsa_titleLaconia, NH -1.87e-01
## cbsa_titleLafayette-West Lafayette, IN 8.62e-02
## cbsa_titleLafayette, LA -5.38e-03
## cbsa_titleLaGrange, GA-AL 6.53e-02
## cbsa_titleLake Charles, LA -2.21e-01
## cbsa_titleLake City, FL 1.37e-01
## cbsa_titleLake Havasu City-Kingman, AZ -6.64e-03
## cbsa_titleLakeland-Winter Haven, FL 1.25e-02
## cbsa_titleLancaster, PA 2.68e-02
## cbsa_titleLansing-East Lansing, MI -1.77e-01
## cbsa_titleLaredo, TX -1.44e-01
## cbsa_titleLas Cruces, NM -5.87e-02
## cbsa_titleLas Vegas-Henderson-North Las Vegas, NV -1.57e-01
## cbsa_titleLaurel, MS 1.24e-01
## cbsa_titleLawrence, KS 5.97e-02
## cbsa_titleLawton, OK -2.38e-01
## cbsa_titleLebanon-Claremont, NH-VT 1.84e-02
## cbsa_titleLebanon, PA 8.70e-02
## cbsa_titleLewiston-Auburn, ME 3.08e-01
## cbsa_titleLewiston, ID-WA 1.95e-01
## cbsa_titleLexington Park, MD -3.20e-01
## cbsa_titleLexington-Fayette, KY -3.99e-03
## cbsa_titleLima, OH -9.48e-02
## cbsa_titleLincoln, NE -8.27e-03
## cbsa_titleLittle Rock-North Little Rock-Conway, AR -7.02e-02
## cbsa_titleLogan, UT-ID 1.45e-01
## cbsa_titleLongview-Kelso, WA 2.67e-01
## cbsa_titleLongview, TX -1.17e-01
## cbsa_titleLos Angeles-Long Beach-Anaheim, CA 4.52e-01
## cbsa_titleLouisville/Jefferson County, KY-IN -2.57e-02
## cbsa_titleLubbock, TX -2.51e-01
## cbsa_titleLufkin, TX -5.66e-02
## cbsa_titleLumberton, NC -4.78e-02
## cbsa_titleLynchburg, VA 1.63e-01
## cbsa_titleMacon-Bibb County, GA -2.38e-01
## cbsa_titleMadison, WI -6.91e-02
## cbsa_titleManchester-Nashua, NH -1.27e-01
## cbsa_titleManhattan, KS -2.73e-01
## cbsa_titleManitowoc, WI 4.93e-03
## cbsa_titleMankato, MN -1.56e-01
## cbsa_titleMansfield, OH 3.42e-02
## cbsa_titleMarinette, WI-MI -1.24e-01
## cbsa_titleMarion-Herrin, IL -5.15e-01
## cbsa_titleMarion, IN 1.21e-01
## cbsa_titleMarquette, MI 5.29e-02
## cbsa_titleMassena-Ogdensburg, NY -2.66e-01
## cbsa_titleMcAllen-Edinburg-Mission, TX -2.29e-01
## cbsa_titleMeadville, PA -1.45e-01
## cbsa_titleMedford, OR 1.75e-01
## cbsa_titleMemphis, TN-MS-AR 8.86e-02
## cbsa_titleMerced, CA 1.97e-01
## cbsa_titleMeridian, MS -4.72e-01
## cbsa_titleMiami-Fort Lauderdale-West Palm Beach, FL 1.01e-01
## cbsa_titleMichigan City-La Porte, IN 5.84e-02
## cbsa_titleMidland, MI -5.57e-03
## cbsa_titleMidland, TX -1.28e-01
## cbsa_titleMilwaukee-Waukesha, WI 6.60e-02
## cbsa_titleMinneapolis-St. Paul-Bloomington, MN-WI -1.47e-01
## cbsa_titleMinot, ND -1.94e-01
## cbsa_titleMissoula, MT 4.44e-01
## cbsa_titleMobile, AL 9.44e-02
## cbsa_titleModesto, CA 3.29e-02
## cbsa_titleMonroe, LA -5.13e-02
## cbsa_titleMonroe, MI 2.43e-01
## cbsa_titleMontgomery, AL 6.07e-03
## cbsa_titleMonticello, NY 1.12e-01
## cbsa_titleMorehead City, NC 8.37e-03
## cbsa_titleMorgantown, WV -1.31e-01
## cbsa_titleMorristown, TN 3.70e-01
## cbsa_titleMoses Lake, WA 2.29e-01
## cbsa_titleMount Airy, NC 1.19e-02
## cbsa_titleMount Pleasant, MI 1.29e-02
## cbsa_titleMount Vernon-Anacortes, WA 6.82e-02
## cbsa_titleMuncie, IN -2.41e-01
## cbsa_titleMurrells Inlet, SC 2.12e-01
## cbsa_titleMuskegon-Norton Shores, MI 3.71e-01
## cbsa_titleMuskogee, OK -2.58e-01
## cbsa_titleMyrtle Beach-Conway-North Myrtle Beach, SC -3.69e-02
## cbsa_titleNacogdoches, TX -1.28e-01
## cbsa_titleNapa, CA 3.39e-01
## cbsa_titleNaples-Marco Island, FL -2.79e-01
## cbsa_titleNashville-Davidson--Murfreesboro--Franklin, TN 6.67e-02
## cbsa_titleNew Bern, NC 1.81e-01
## cbsa_titleNew Haven, CT -5.60e-03
## cbsa_titleNew Iberia, LA -2.49e-01
## cbsa_titleNew Orleans-Metairie, LA 1.51e-01
## cbsa_titleNew Philadelphia-Dover, OH -1.38e-01
## cbsa_titleNew York-Newark-Jersey City, NY-NJ 4.38e-01
## cbsa_titleNiles, MI 1.40e-01
## cbsa_titleNorth Port-Bradenton-Sarasota, FL -3.53e-01
## cbsa_titleNorth Wilkesboro, NC 2.69e-01
## cbsa_titleNorwich-New London-Willimantic, CT 8.62e-02
## cbsa_titleOak Harbor, WA 4.71e-02
## cbsa_titleOcala, FL -2.11e-01
## cbsa_titleOdessa, TX -1.31e-01
## cbsa_titleOgden, UT 6.06e-03
## cbsa_titleOklahoma City, OK -5.08e-02
## cbsa_titleOlean, NY -3.48e-02
## cbsa_titleOlympia-Lacey-Tumwater, WA -1.02e-01
## cbsa_titleOmaha, NE-IA -9.24e-02
## cbsa_titleOpelousas, LA -1.96e-01
## cbsa_titleOrangeburg, SC -2.17e-02
## cbsa_titleOrlando-Kissimmee-Sanford, FL -5.05e-02
## cbsa_titleOshkosh-Neenah, WI 3.96e-02
## cbsa_titleOttawa, IL -2.67e-01
## cbsa_titleOwensboro, KY -1.26e-02
## cbsa_titleOwosso, MI -1.05e-01
## cbsa_titleOxford, MS -3.87e-02
## cbsa_titleOxnard-Thousand Oaks-Ventura, CA 2.67e-01
## cbsa_titlePaducah, KY-IL -3.52e-01
## cbsa_titlePalatka, FL 2.70e-01
## cbsa_titlePalm Bay-Melbourne-Titusville, FL 3.99e-02
## cbsa_titlePanama City-Panama City Beach, FL 1.29e-01
## cbsa_titleParkersburg-Vienna, WV -2.98e-01
## cbsa_titlePensacola-Ferry Pass-Brent, FL 1.05e-01
## cbsa_titlePeoria, IL -4.77e-01
## cbsa_titlePhiladelphia-Camden-Wilmington, PA-NJ-DE-MD 9.31e-02
## cbsa_titlePhoenix-Mesa-Chandler, AZ -8.01e-03
## cbsa_titlePikeville, KY -2.93e-01
## cbsa_titlePine Bluff, AR -4.93e-01
## cbsa_titlePinehurst-Southern Pines, NC -8.55e-03
## cbsa_titlePittsburgh, PA -5.66e-02
## cbsa_titlePittsfield, MA 3.75e-01
## cbsa_titlePlattsburgh, NY -1.06e-01
## cbsa_titlePocatello, ID 6.48e-02
## cbsa_titlePort Angeles, WA 6.84e-02
## cbsa_titlePort St. Lucie, FL -1.79e-01
## cbsa_titlePortland-South Portland, ME 2.63e-01
## cbsa_titlePortland-Vancouver-Hillsboro, OR-WA -7.16e-02
## cbsa_titlePortsmouth, OH .
## cbsa_titlePottsville, PA -4.17e-01
## cbsa_titlePrescott Valley-Prescott, AZ 5.45e-02
## cbsa_titleProvidence-Warwick, RI-MA 2.55e-01
## cbsa_titleProvo-Orem-Lehi, UT -1.52e-01
## cbsa_titlePueblo, CO 9.11e-02
## cbsa_titlePunta Gorda, FL -5.76e-01
## cbsa_titlePutnam, CT 1.28e-01
## cbsa_titleQuincy, IL-MO -3.06e-01
## cbsa_titleRacine-Mount Pleasant, WI 1.04e-01
## cbsa_titleRaleigh-Cary, NC -2.69e-01
## cbsa_titleRapid City, SD 2.05e-01
## cbsa_titleReading, PA -8.73e-02
## cbsa_titleRedding, CA 1.18e-01
## cbsa_titleReno, NV 1.62e-03
## cbsa_titleRexburg, ID 6.36e-02
## cbsa_titleRichmond-Berea, KY 1.29e-01
## cbsa_titleRichmond, IN 4.07e-03
## cbsa_titleRichmond, VA 1.18e-01
## cbsa_titleRifle, CO 1.06e+00
## cbsa_titleRio Grande City-Roma, TX .
## cbsa_titleRiverside-San Bernardino-Ontario, CA 6.53e-02
## cbsa_titleRoanoke Rapids, NC -9.17e-02
## cbsa_titleRoanoke, VA -3.45e-02
## cbsa_titleRochester, MN -1.46e-02
## cbsa_titleRochester, NY -6.85e-02
## cbsa_titleRockford, IL -3.03e-01
## cbsa_titleRocky Mount, NC 1.39e-02
## cbsa_titleRome, GA 9.01e-02
## cbsa_titleRoseburg, OR 3.18e-01
## cbsa_titleRussellville, AR -1.46e-01
## cbsa_titleSacramento-Roseville-Folsom, CA -4.09e-02
## cbsa_titleSaginaw, MI -2.89e-02
## cbsa_titleSalem, OH -7.60e-02
## cbsa_titleSalem, OR -1.24e-02
## cbsa_titleSalinas, CA 6.88e-01
## cbsa_titleSalisbury, MD 9.49e-02
## cbsa_titleSalt Lake City-Murray, UT -4.10e-02
## cbsa_titleSan Angelo, TX -3.64e-01
## cbsa_titleSan Antonio-New Braunfels, TX -2.33e-01
## cbsa_titleSan Diego-Chula Vista-Carlsbad, CA 1.83e-01
## cbsa_titleSan Francisco-Oakland-Fremont, CA 1.90e-02
## cbsa_titleSan Jose-Sunnyvale-Santa Clara, CA -1.28e-01
## cbsa_titleSan Luis Obispo-Paso Robles, CA 3.25e-01
## cbsa_titleSandusky, OH 6.07e-02
## cbsa_titleSanford, NC 1.33e-01
## cbsa_titleSanta Cruz-Watsonville, CA 4.97e-01
## cbsa_titleSanta Fe, NM 1.18e-01
## cbsa_titleSanta Maria-Santa Barbara, CA 9.48e-01
## cbsa_titleSanta Rosa-Petaluma, CA 1.88e-01
## cbsa_titleSavannah, GA 9.45e-02
## cbsa_titleScranton--Wilkes-Barre, PA -1.55e-01
## cbsa_titleSeaford, DE -2.29e-01
## cbsa_titleSearcy, AR 3.22e-02
## cbsa_titleSeattle-Tacoma-Bellevue, WA 6.86e-03
## cbsa_titleSebastian-Vero Beach-West Vero Corridor, FL -2.37e-01
## cbsa_titleSebring, FL -2.97e-01
## cbsa_titleSeneca, SC -3.69e-02
## cbsa_titleSevierville, TN 3.65e-01
## cbsa_titleShawnee, OK -8.16e-02
## cbsa_titleSheboygan, WI 4.78e-02
## cbsa_titleShelby-Kings Mountain, NC 6.96e-02
## cbsa_titleShelton, WA -1.24e-01
## cbsa_titleSherman-Denison, TX 2.29e-02
## cbsa_titleShow Low, AZ 3.70e-01
## cbsa_titleShreveport-Bossier City, LA 2.64e-02
## cbsa_titleSierra Vista-Douglas, AZ 1.11e-02
## cbsa_titleSioux City, IA-NE-SD -1.67e-01
## cbsa_titleSioux Falls, SD-MN 7.73e-02
## cbsa_titleSlidell-Mandeville-Covington, LA .
## cbsa_titleSomerset, KY 2.38e-01
## cbsa_titleSomerset, PA -1.23e-01
## cbsa_titleSouth Bend-Mishawaka, IN-MI 7.37e-03
## cbsa_titleSpartanburg, SC -1.05e-02
## cbsa_titleSpokane-Spokane Valley, WA 7.45e-02
## cbsa_titleSpringfield, IL -6.23e-01
## cbsa_titleSpringfield, MA 2.07e-01
## cbsa_titleSpringfield, MO -1.71e-02
## cbsa_titleSpringfield, OH -2.13e-02
## cbsa_titleSt. Cloud, MN 8.38e-03
## cbsa_titleSt. George, UT 1.40e-01
## cbsa_titleSt. Joseph, MO-KS -3.08e-01
## cbsa_titleSt. Louis, MO-IL -1.35e-01
## cbsa_titleState College, PA -5.76e-02
## cbsa_titleStatesboro, GA 5.32e-01
## cbsa_titleStaunton-Stuarts Draft, VA 2.80e-01
## cbsa_titleStevens Point-Plover, WI 4.48e-02
## cbsa_titleStillwater, OK -1.06e-02
## cbsa_titleStockton-Lodi, CA -7.65e-02
## cbsa_titleSumter, SC 2.06e-01
## cbsa_titleSunbury, PA -1.50e-01
## cbsa_titleSyracuse, NY 2.19e-03
## cbsa_titleTalladega-Sylacauga, AL 7.93e-02
## cbsa_titleTallahassee, FL 1.11e-01
## cbsa_titleTampa-St. Petersburg-Clearwater, FL -1.05e-01
## cbsa_titleTerre Haute, IN -2.59e-01
## cbsa_titleTexarkana, TX-AR -1.95e-01
## cbsa_titleThomasville, GA 2.08e-01
## cbsa_titleToledo, OH -4.59e-02
## cbsa_titleTopeka, KS -2.45e-01
## cbsa_titleTorrington, CT 1.68e-01
## cbsa_titleTraverse City, MI 2.05e-01
## cbsa_titleTrenton-Princeton, NJ -2.55e-01
## cbsa_titleTruckee-Grass Valley, CA -2.75e-01
## cbsa_titleTucson, AZ 2.32e-04
## cbsa_titleTullahoma-Manchester, TN 2.43e-01
## cbsa_titleTulsa, OK -1.52e-01
## cbsa_titleTupelo, MS 1.53e-01
## cbsa_titleTuscaloosa, AL 1.45e-01
## cbsa_titleTwin Falls, ID 2.42e-01
## cbsa_titleTyler, TX -1.69e-01
## cbsa_titleUkiah, CA 1.74e-01
## cbsa_titleUrban Honolulu, HI -5.94e-02
## cbsa_titleUtica-Rome, NY 7.50e-02
## cbsa_titleValdosta, GA 2.46e-01
## cbsa_titleVallejo, CA -4.15e-02
## cbsa_titleVictoria, TX -4.31e-02
## cbsa_titleVineland, NJ 5.56e-02
## cbsa_titleVirginia Beach-Chesapeake-Norfolk, VA-NC 1.58e-01
## cbsa_titleVisalia, CA -3.53e-02
## cbsa_titleWaco, TX 8.96e-02
## cbsa_titleWalla Walla, WA -3.37e-02
## cbsa_titleWarner Robins, GA -1.58e-01
## cbsa_titleWarsaw, IN -3.75e-03
## cbsa_titleWashington-Arlington-Alexandria, DC-VA-MD-WV -1.73e-01
## cbsa_titleWaterbury-Shelton, CT 1.83e-01
## cbsa_titleWaterloo-Cedar Falls, IA 5.21e-02
## cbsa_titleWatertown-Fort Atkinson, WI -6.87e-02
## cbsa_titleWatertown-Fort Drum, NY -2.94e-01
## cbsa_titleWausau, WI -1.76e-01
## cbsa_titleWeirton-Steubenville, WV-OH -2.44e-01
## cbsa_titleWenatchee-East Wenatchee, WA 9.28e-02
## cbsa_titleWheeling, WV-OH -1.18e-01
## cbsa_titleWhitewater-Elkhorn, WI -2.92e-02
## cbsa_titleWichita Falls, TX -3.15e-01
## cbsa_titleWichita, KS -1.43e-01
## cbsa_titleWildwood-The Villages, FL -4.37e-01
## cbsa_titleWilliamsport, PA -3.41e-03
## cbsa_titleWilmington, NC 7.66e-03
## cbsa_titleWilson, NC 5.41e-03
## cbsa_titleWinchester, VA-WV -5.27e-04
## cbsa_titleWinston-Salem, NC 2.36e-02
## cbsa_titleWisconsin Rapids-Marshfield, WI -1.17e-01
## cbsa_titleWooster, OH 1.02e-01
## cbsa_titleWorcester, MA 6.97e-02
## cbsa_titleYakima, WA 1.43e-01
## cbsa_titleYork-Hanover, PA -9.96e-03
## cbsa_titleYoungstown-Warren, OH 1.32e-03
## cbsa_titleYuba City, CA -1.88e-02
## cbsa_titleYuma, AZ 1.57e-02
## cbsa_titleZanesville, OH -8.62e-02
## total_listing_count -4.60e-07
## pending_ratio 2.17e-02
## median_days_on_market -9.51e-04
cv_lasso_log_price_no_cbsa = cv.glmnet(X_train_no_cbsa_matrix, log_price_train, alpha = 1)
lasso_log_price_no_cbsa_best_lambda = cv_lasso_log_price$lambda.min
lasso_log_price_no_cbsa_best_lambda
## [1] 0.0000929
plot(cv_lasso_log_price_no_cbsa)
lasso_log_price_no_cbsa_best_model = glmnet(X_train_no_cbsa_matrix, log_price_train, alpha = 1, lambda=lasso_log_price_no_cbsa_best_lambda)
coef(lasso_log_price_no_cbsa_best_model)
## 53 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -1.62e+02
## total_population 1.67e-08
## pct_female 2.23e-02
## pct_white -1.75e-02
## pct_black -2.13e-02
## pct_aian -1.43e-02
## pct_asian -1.82e-02
## pct_nhpi 2.14e-02
## pct_two_or_more -1.46e-02
## pct_hispanic -1.31e-03
## median_age 8.33e-03
## pct_under_18 -2.29e-02
## total_households .
## pct_married_couple -1.33e-02
## pct_male_householder -2.82e-02
## pct_female_householder -1.46e-02
## pct_unemployed -4.11e-03
## pct_below_poverty -7.15e-03
## pct_income_10_14999 1.37e-02
## pct_income_15_24999 1.55e-02
## pct_income_25_34999 1.42e-02
## pct_income_35_49999 1.83e-02
## pct_income_50_74999 1.68e-02
## pct_income_75_99999 1.94e-02
## pct_income_100_149k 3.41e-02
## pct_income_150_199k 3.99e-02
## pct_income_200k_plus 5.01e-02
## median_household_income 8.58e-08
## pct_high_school_plus -3.78e-04
## pct_bachelors_plus -1.19e-03
## total_housing_units 3.34e-08
## pct_vacant_units 1.10e-02
## pct_owner_occupied -1.36e-02
## pct_built_2020_plus 1.70e-02
## pct_built_2010_2019 2.01e-03
## pct_built_2000_2009 8.14e-03
## pct_built_1990_1999 1.94e-02
## pct_built_1980_1989 5.71e-03
## pct_built_1970_1979 1.65e-02
## pct_built_1960_1969 5.18e-03
## pct_built_1950_1959 5.10e-03
## pct_built_1940_1949 6.84e-03
## pct_1_bed -1.05e-02
## pct_2_bed -1.40e-02
## pct_3_bed -1.66e-02
## pct_4_bed -2.65e-02
## pct_5plus_bed -1.26e-02
## area_typeMetro -6.41e-02
## area_typeMicro 4.82e-13
## month_date_yyyymm 8.33e-04
## total_listing_count -1.18e-05
## pending_ratio -5.43e-02
## median_days_on_market 2.19e-04
## ---- RT_simple ----
options(scipen = 999) # turn off scientific notation
# # Keep only interpretable economic drivers
# inventory_model_simple <- full_dataset_clean %>%
# select(
# median_listing_price_per_square_foot,
# total_listing_count,
# pending_ratio,
# median_days_on_market
# )
#
# # Fit a simple, readable regression tree
# reg_tree_simple <- rpart(
# median_listing_price_per_square_foot ~
# total_listing_count + pending_ratio + median_days_on_market,
# data = inventory_model_simple,
# method = "anova",
# control = rpart.control(maxdepth = 3, minbucket = 200)
# )
#
# # Plot the tree
# rpart.plot(
# reg_tree_simple,
# type = 2,
# extra = 101,
# under = TRUE,
# fallen.leaves = TRUE,
# roundint = TRUE,
# main = "Regression Tree: Price per Sq.Ft"
# )
# Basic check: supply distribution
summary(full_dataset_clean$total_listing_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40 335 626 2090 1633 52936
#stephs expanded regression tree (median)
tree_data_med <- train %>%
select(-log_median_price_sqft,
-cbsa_title) %>%
na.omit()
set.seed(42)
reg_tree_exp_med <- rpart(median_listing_price_per_square_foot ~ .,
data = tree_data_med,
method = "anova",
control = rpart.control(minbucket = 200))
best_cp_med <- reg_tree_exp_med$cptable[which.min(reg_tree_exp_med$cptable[,"xerror"]), "CP"]
best_cp_med
## [1] 0.01
pruned_tree_data_med <- prune(reg_tree_exp_med,
cp = best_cp_med)
options(repr.plot.width = 28,
repr.plot.height = 14)
fancyRpartPlot(reg_tree_exp_med,
main = "Expanded Regression Tree: Median Price per Sq.Ft",
sub = paste0("n=", nrow(tree_data_med), " | maxdepth=", reg_tree_exp_med$control$maxdepth),
cex = 1.4,
tweak = 0.5)
set.seed(42)
rf_data_med <- train %>%
mutate(across(where(is.character), as.factor)) %>%
select(-log_median_price_sqft,-cbsa_title ) %>% #log(price) is basically price. ; cbsa can't be used anyway
na.omit() # Can not handle categorical predictors with more than 53 categories.
rf_exp_med <- randomForest(
median_listing_price_per_square_foot ~ .,
data = rf_data_med,
ntree = 500,
mtry = floor(sqrt(ncol(rf_data_med) - 1)),
nodesize = 15,
importance = TRUE,
na.action = na.omit
)
imp <- randomForest::importance(rf_exp_med)
imp_df <- data.frame(
Feature = rownames(imp),
Importance = imp[, "%IncMSE"]
)
imp_df %>%
top_n(25, Importance) %>% #top 25 indicators
ggplot(aes(x=reorder(Feature, Importance), y=Importance)) +
geom_col(fill="steelblue") +
coord_flip() +
labs(title="Top 25 Key Indicators",
x="Feature",
y="MSE-contribution (%)") +
theme_minimal(base_size = 13)
#### Random Forest with Log
set.seed(42)
rf_data_log <- train %>%
select(-median_listing_price_per_square_foot,-cbsa_title ) #log(price) is basically price. ; cbsa can't be used anyway # Can not handle categorical predictors with more than 53 categories.
rf_exp_log <- randomForest(
log_median_price_sqft ~ .,
data = rf_data_log,
ntree = 500,
mtry = floor(sqrt(ncol(rf_data_log) - 1)),
nodesize = 15,
importance = TRUE,
na.action = na.omit
)
imp_log <- randomForest::importance(rf_exp_log)
imp_log_df <- data.frame(
Feature = rownames(imp_log),
Importance = imp[, "%IncMSE"]
)
imp_log_df %>%
top_n(25, Importance) %>% #top 25 indicators
ggplot(aes(x=reorder(Feature, Importance), y=Importance)) +
geom_col(fill="steelblue") +
coord_flip() +
labs(title="Top 25 Key Indicators",
x="Feature",
y="MSE-contribution (%)") +
theme_minimal(base_size = 13)
```
#data manipulation EDA- summary statistics(baylee) Modeling- linear regressions (alex) Modeling- lasso regression (alex) – probably lots of options here Modeling- regression tree (steph) Modeling- regression forest (ilgaz)